This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. Html_fragmented is quicker to load than html, hence for this pipeline which genenerated a large mount of data, we have used html_fragment to report Rmarkdown.
#1) GEO series number from publication or seatching GEO for dataset of interest, here GSE=GSE33000 and GSE=GSE43490 #2) The GPL gene annotation file, here GPL=GPL4372 for GSE33000 and GPL=GPL6480 for GSE43490 #3) To reuse code for other studies replace the GEO and GPL numbers #4) metadata file made by manually combining individual metadata files from GEO to give ‘merge_GSE33000_GSE43490_metadata.csv’ #5) metadata coded file made manually from ‘merge_GSE33000_GSE43490_metadata_SampleID.csv’ outputed at end of merge code Step 12 to give ‘merge_GSE33000_GSE43490_metadata_SampleID_Disease_single_coded’ code used is Study GSE33000=1, GSE43490=2; Gender Female=1, Male=2 ; Disease CON=1, DIS=2 #6) run code till step 28 and then based on cell type and trait associated module replace in code “black” or “HuAgeDis_07” with module name that is associated with trait or disease of interest. And replace hub gene names. #7) WGCNA assigns colors randomly (except few reserved colors like “grey” for unassigned genes). Therefore, upon re-run of this analysis it may call the age and disease associated microglia enriched “black” module by some other color such as “pink”, in which case this color is specified step 29 onwards as described in point 6) above. #8) The present analysis was done on MacOS, using knitToHtmlfragment.
#9)To reuse this code for other datasets a) replace 1) and 2) input files above with equivalent files for the dataset b) replace “HuAgeDis_” for module naming to pre-fix of users choice that is meaningful for the dataset. Here Hu=Human, Age=Age, Dis=Disease c) in this pipeline human gene symbols are used
Getting raw data GSE33000
#save working directory location
wd<-getwd()
wd
## [1] "/Users/powermac/Desktop/HuAgeDis11-30-18"
#Install packages by uncommenting two lines below if packages not already installed before
#source("https://bioconductor.org/biocLite.R")
#biocLite(c("GEOquery","Biobase", "limma", "R.utils", "lumi", "DT"))
library(Biobase)
library(GEOquery)
library(lumi)
library(R.utils)
library(DT)
library(limma)
library(sva)
library(pamr)
#At start of pipeline no input files are needed except the .Rproj which sets working directory to current directory and keeps R environment independent of other folders and the .Rmd R code
list.files()
## [1] "HumanAgeDis.Rmd" "HumanAgeDis.Rproj" "InputHuAgeDis"
#2.1 Get GE0 data as R object
# Saving series and platform gene annotation data from GEO to gset object
gset <- getGEO("GSE33000", GSEMatrix =TRUE, AnnotGPL=TRUE)
if (length(gset) > 1) idx <- grep("GPL4372", attr(gset, "names")) else idx <- 1
gset <- gset[[idx]]
#2.2 keep only samples of interest
#Interested in all samples so keeping everything.
#2.3 Getting GEO metadata of interest
#Names of available phenotype data
names(pData(gset))
## [1] "title" "geo_accession"
## [3] "status" "submission_date"
## [5] "last_update_date" "type"
## [7] "channel_count" "source_name_ch1"
## [9] "organism_ch1" "characteristics_ch1"
## [11] "characteristics_ch1.1" "treatment_protocol_ch1"
## [13] "growth_protocol_ch1" "molecule_ch1"
## [15] "extract_protocol_ch1" "label_ch1"
## [17] "label_protocol_ch1" "taxid_ch1"
## [19] "source_name_ch2" "organism_ch2"
## [21] "characteristics_ch2" "characteristics_ch2.1"
## [23] "characteristics_ch2.2" "characteristics_ch2.3"
## [25] "treatment_protocol_ch2" "growth_protocol_ch2"
## [27] "molecule_ch2" "extract_protocol_ch2"
## [29] "label_ch2" "label_protocol_ch2"
## [31] "taxid_ch2" "hyb_protocol"
## [33] "scan_protocol" "description"
## [35] "data_processing" "platform_id"
## [37] "contact_name" "contact_email"
## [39] "contact_department" "contact_institute"
## [41] "contact_address" "contact_city"
## [43] "contact_state" "contact_zip/postal_code"
## [45] "contact_country" "supplementary_file"
## [47] "data_row_count" "age:ch2"
## [49] "disease status:ch2" "gender:ch2"
## [51] "sample type:ch1" "tissue:ch1"
## [53] "tissue:ch2"
metadata=data.frame(gset$geo_accession, gset$source_name_ch1, gset$`tissue:ch1`, gset$`gender:ch2`, gset$`age:ch2`,gset$`disease status:ch2`)
head(metadata)
## gset.geo_accession gset.source_name_ch1 gset..tissue.ch1.
## 1 GSM1423780 HBTRC_PF_Pool_1 prefrontal cortex brain
## 2 GSM1423781 HBTRC_PF_Pool_2 prefrontal cortex brain
## 3 GSM1423782 HBTRC_PF_Pool_3 prefrontal cortex brain
## 4 GSM1423783 HBTRC_PF_Pool_4 prefrontal cortex brain
## 5 GSM1423784 HBTRC_PF_Pool_5 prefrontal cortex brain
## 6 GSM1423785 HBTRC_PF_Pool_6 prefrontal cortex brain
## gset..gender.ch2. gset..age.ch2. gset..disease.status.ch2.
## 1 female 67 yrs Alzheimer's disease
## 2 male 88 yrs Alzheimer's disease
## 3 male 62 yrs Alzheimer's disease
## 4 female 90 yrs Alzheimer's disease
## 5 female 90 yrs Alzheimer's disease
## 6 female 95 yrs Alzheimer's disease
write.csv(metadata,'GSE33000_metadata.csv')
#2.4 Get GE0 raw expression data.
DT::datatable(exprs(gset)[1:3,])
#The expression data uploaded for this entry is Agilent dual color cy5 cy3 label microarray we will get this data
exprList<-exprs(gset)
write.table(exprList, "GSE33000_expression_data_originalfromGEO.txt", sep="\t", quote=F)
#3.1 Reading and formatting expression data for normalization
#Now we read in the raw expression values
exprRaw<-read.delim('GSE33000_expression_data_originalfromGEO.txt',header=T, sep='\t')
DT::datatable(exprRaw[1:3,])
#The original GSE33000 is in log base 10 log10 ratio (Cy5/Cy3) so lets convert it back to non-log scale. This we can determine by looking at the 1row-1column of the exprRaw datatable above and the 1st datapoint online reference to this data file both of which are -1.618405e-02 https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSM1423780
#conversiton back to non-log scale
exprRaw1 = 10^exprRaw
DT::datatable(exprRaw1[1:3,])
write.table(exprRaw1, "GSE33000_expression_data_non-log.txt", sep="\t", quote=F)
#3.2 Using lumi for quantile normalization
#Perform qualtile normalization on the raw expression data should be matrix format.
exprLumi<-lumiN(as.matrix(exprRaw1),method="quantile")
## Perform quantile normalization ...
DT::datatable(exprLumi[1:3,])
#3.3 log2+1 transformation
#log2+1 transform the expression data. This step also changes it back to data frame format
exprLog<-log2(exprLumi+1)
DT::datatable(exprLog[1:3,])
#3.4 Check for negative values. Negative values will not work in WGCNA
exprRaw_ifneg<-apply(exprRaw, 1, function(row) any(row <0))
length(which(exprRaw_ifneg)) #what is the length of negative numbers
## [1] 39262
exprRaw1_ifneg<-apply(exprRaw1, 1, function(row) any(row <0))
length(which(exprRaw1_ifneg)) #what is the length of negative numbers
## [1] 0
#0 means no negative values
exprLumi_ifneg<-apply(exprLumi, 1, function(row) any(row <0))
length(which(exprLumi_ifneg)) #what is the length of negative numbers
## [1] 0
#0 means no negative values
exprLog_ifneg<-apply(exprLog, 1, function(row) any(row <0))
length(which(exprLog_ifneg)) #what is the length of negative numbers
## [1] 0
#0 means no negative values
#3.5 Check for max and min values. Ususally don’t want max to be <100
exprRaw_max<-which.max(as.matrix(exprRaw))
exprRaw_max
## [1] 26420
exprRaw_min<-which.min(as.matrix(exprRaw))
exprRaw_min
## [1] 151436
exprRaw1_max<-which.max(as.matrix(exprRaw1))
exprRaw1_max
## [1] 26420
exprRaw1_min<-which.min(as.matrix(exprRaw1))
exprRaw1_min
## [1] 151436
exprLumi_max<-which.max(as.matrix(exprLumi))
exprLumi_max
## [1] 110960
exprLumi_min<-which.min(as.matrix(exprLumi))
exprLumi_min
## [1] 9190
exprLog_max<-which.max(as.matrix(exprLog))
exprLog_max
## [1] 110960
exprLog_min<-which.min(as.matrix(exprLog))
exprLog_min
## [1] 9190
#3.6 Descriptive or summary statistics of the data
DT::datatable(summary(exprRaw))
DT::datatable(summary(exprRaw1))
DT::datatable(summary(exprLumi))
DT::datatable(summary(exprLog))
#3.7 Visualization of GSE33000 alone
pdf(file="Visualization_GSE33000_rawdata_transformation.pdf",height=5,width=5)
par(mfrow=c(2,2))
#exprRaw boxplot
par(mai=c(1,0.8,1,0.8))
boxplot(exprRaw, outline=FALSE, las=2, cex=0.25, main="exprRaw", col="yellow")
#exprRaw1 boxplot
par(mai=c(1,0.8,1,0.8))
boxplot(exprRaw1, outline=FALSE, las=2, cex=0.25, main="exprRaw", col="yellow")
#exprLumi boxplot
par(mai=c(1,0.8,1,0.8))
boxplot(exprLumi, outline=FALSE, las=2, cex=0.25, main="exprLumi", col="yellow")
#exprLog boxplot
par(mai=c(1,0.8,1,0.8))
boxplot(exprLog, outline=FALSE, las=2, cex=0.25, main="exprLog", col="yellow")
#exprRaw MDSplot, colored by Study
par(mai=c(1,0.8,1,0.8))
plotMDS(exprRaw, legend= "all", main="exprRaw Study", cex=0.5)#like PCA plot
#exprRaw1 MDSplot, colored by Study
par(mai=c(1,0.8,1,0.8))
plotMDS(exprRaw1, legend= "all", main="exprRaw1 Study", cex=0.5)#like PCA plot
#exprLumi MDSplot, colored by Study
par(mai=c(1,0.8,1,0.8))
plotMDS(exprLumi, legend= "all", main="exprLumi Study", cex=0.5)#like PCA plot
#exprLog MDSplot, colored by Study
par(mai=c(1,0.8,1,0.8))
plotMDS(exprLog, legend= "all", main="exprLog Study", cex=0.5)#like PCA plot
dev.off()
## quartz_off_screen
## 2
#4.1 Getting GPL annotation file
GPLid<-annotation(gset)
GPL_file<-getGEO(GPLid)
## File stored at:
## /var/folders/0g/5tz57sgx7090n5vttfqvs3ym0000gn/T//Rtmpv5hBUt/GPL4372.soft
colnames(GPL_file@dataTable@table)
## [1] "ID" "EntrezGeneID"
## [3] "ORF" "GB_ACC"
## [5] "PROBE_DERIVED_FROM_TRANSCRIPT" "SPOT_ID"
## [7] "RosettaGeneModelID"
write.table(GPL_file@dataTable@table,"GPL4372.txt", sep='\t', quote=F)
#We pick the ID or reporter ID and ORF or gene symbols from options displayed above. We need this to annotate expression data. Will use ID to combine.
GPL_file1<-GPL_file@dataTable@table[,c("ID","ORF")]
DT::datatable(head(GPL_file1))
#4.2 preparing gene expression data exprLog for merging with annotation file
exprGene=exprLog
#colnames already are sample GSM id
DT::datatable(head(exprGene))
#Add "ID" column that we will use for merging with annotation
exprGene1=cbind(exprGene, rownames(exprGene))
colnames(exprGene1)=c(colnames(exprGene),"ID")
DT::datatable(head(exprGene1))
#4.3 get gene expression data with gene symbols. GEO annotation is static and ensures that annotation of genes does not change over time.
#This is complete gene expression with Gene symbol annotations
#Save and use columns for pipelines as needed
exprGene2=merge(exprGene1, GPL_file1, by="ID")
dim(exprGene2)
## [1] 39280 626
DT::datatable(head(exprGene2))
write.csv(exprGene2, "GSE33000_Annotation_Expr_HuGene.csv")
save.image(file="preSVALM_GSE33000_temp.RData")
Getting raw data GSE43490
#6.1 Get GE0 data as R object
# Saving series and platform gene annotation data from GEO to gset object
gset <- getGEO("GSE43490", GSEMatrix =TRUE, AnnotGPL=TRUE)
if (length(gset) > 1) idx <- grep("GPL6480", attr(gset, "names")) else idx <- 1
gset <- gset[[idx]]
#6.2 keep only samples of interest
#these groups were selected on the GEO2R to keep only sunstanitia niagra samples
gsms <- "XXXXXXXXXXXXXX1111111100000XXXXXXXXXXXXXX"
sml <- c()
for (i in 1:nchar(gsms)) { sml[i] <- substr(gsms,i,i) }
# eliminate samples marked as "X"
sel <- which(sml != "X")
sml <- sml[sel]
gset <- gset[ ,sel]
#6.3 Getting GEO metadata of interest
#Names of available phenotype data
names(pData(gset))
## [1] "title" "geo_accession"
## [3] "status" "submission_date"
## [5] "last_update_date" "type"
## [7] "channel_count" "source_name_ch1"
## [9] "organism_ch1" "characteristics_ch1"
## [11] "characteristics_ch1.1" "characteristics_ch1.2"
## [13] "characteristics_ch1.3" "treatment_protocol_ch1"
## [15] "molecule_ch1" "extract_protocol_ch1"
## [17] "label_ch1" "label_protocol_ch1"
## [19] "taxid_ch1" "hyb_protocol"
## [21] "scan_protocol" "description"
## [23] "data_processing" "platform_id"
## [25] "contact_name" "contact_email"
## [27] "contact_phone" "contact_laboratory"
## [29] "contact_department" "contact_institute"
## [31] "contact_address" "contact_city"
## [33] "contact_state" "contact_zip/postal_code"
## [35] "contact_country" "supplementary_file"
## [37] "data_row_count" "age:ch1"
## [39] "disease stage:ch1" "disease state:ch1"
## [41] "gender:ch1"
metadata=data.frame(gset$geo_accession, gset$source_name_ch1, gset$`gender:ch1`, gset$`age:ch1`,gset$`disease state:ch1`, gset$`disease stage:ch1`)
head(metadata)
## gset.geo_accession gset.source_name_ch1 gset..gender.ch1.
## 1 GSM1294118 brain, substantia_nigra Female
## 2 GSM1294119 brain, substantia_nigra Male
## 3 GSM1294120 brain, substantia_nigra Male
## 4 GSM1294121 brain, substantia_nigra Female
## 5 GSM1294122 brain, substantia_nigra Female
## 6 GSM1294123 brain, substantia_nigra Male
## gset..age.ch1. gset..disease.state.ch1. gset..disease.stage.ch1.
## 1 85 Parkinson's disease Braak 4
## 2 64 Parkinson's disease Braak 4
## 3 66 Parkinson's disease Braak 4
## 4 80 Parkinson's disease Braak 4
## 5 84 Parkinson's disease Braak 4
## 6 68 Parkinson's disease Braak 5
write.csv(metadata,'GSE43490_metadata.csv')
#6.4 Get GE0 raw expression data.
DT::datatable(exprs(gset)[1:3,])
#The expression data uploaded for this entry is Agilent single color cy3 label we will get this data
exprList<-exprs(gset)
write.table(exprList, "GSE43490_expression_data_originalfromGEO.txt", sep="\t", quote=F)
#7.1 Reading and formatting expression data normalization
#Now we read in the raw expression values
exprRaw<-read.delim('GSE43490_expression_data_originalfromGEO.txt',header=T, sep='\t')
DT::datatable(exprRaw[1:3,])
#The original GSE43490 is in log base 2 log2 single color Cy3 Lowess method normalized so lets convert it back to non-log scale. This we can determine by looking at the 1row-1column of the exprRaw datatable above and the 1st datapoint online reference to this data file both of which are 5.800456543 https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSM1294104
#conversiton back to non-log scale
exprRaw1 = 2^exprRaw
DT::datatable(exprRaw1[1:3,])
write.table(exprRaw1, "GSE43490_expression_data_non-log.txt", sep="\t", quote=F)
#7.2 Using lumi for quantile normalization
#Perform qualtile normalization on the raw expression data should be matrix format.
exprLumi<-lumiN(as.matrix(exprRaw1),method="quantile")
## Perform quantile normalization ...
DT::datatable(exprLumi[1:3,])
#7.3 log2+1 transformation
#log2+1 transform the expression data. This step also changes it back to data frame format
exprLog<-log2(exprLumi+1)
DT::datatable(exprLog[1:3,])
#7.4 Check for negative values. Negative values will not work in WGCNA
exprRaw_ifneg<-apply(exprRaw, 1, function(row) any(row <0))
length(which(exprRaw_ifneg)) #what is the length of negative numbers
## [1] 0
exprRaw1_ifneg<-apply(exprRaw1, 1, function(row) any(row <0))
length(which(exprRaw1_ifneg)) #what is the length of negative numbers
## [1] 0
#0 means no negative values
exprLumi_ifneg<-apply(exprLumi, 1, function(row) any(row <0))
length(which(exprLumi_ifneg)) #what is the length of negative numbers
## [1] 0
#0 means no negative values
exprLog_ifneg<-apply(exprLog, 1, function(row) any(row <0))
length(which(exprLog_ifneg)) #what is the length of negative numbers
## [1] 0
#0 means no negative values
#7.5 Check for max and min values. Ususally don’t want max to be <100
exprRaw_max<-which.max(as.matrix(exprRaw))
exprRaw_max
## [1] 151507
exprRaw_min<-which.min(as.matrix(exprRaw))
exprRaw_min
## [1] 3548
exprRaw1_max<-which.max(as.matrix(exprRaw1))
exprRaw1_max
## [1] 151507
exprRaw1_min<-which.min(as.matrix(exprRaw1))
exprRaw1_min
## [1] 3548
exprLumi_max<-which.max(as.matrix(exprLumi))
exprLumi_max
## [1] 4108
exprLumi_min<-which.min(as.matrix(exprLumi))
exprLumi_min
## [1] 3548
exprLog_max<-which.max(as.matrix(exprLog))
exprLog_max
## [1] 4108
exprLog_min<-which.min(as.matrix(exprLog))
exprLog_min
## [1] 3548
#7.6 Visualization of GSE43490 alone
pdf(file="Visualization_GSE43490_rawdata_transformation.pdf",height=5,width=5)
par(mfrow=c(2,2))
#exprRaw boxplot
par(mai=c(1,0.8,1,0.8))
boxplot(exprRaw, outline=FALSE, las=2, cex=0.25, main="exprRaw", col="yellow")
#exprRaw1 boxplot
par(mai=c(1,0.8,1,0.8))
boxplot(exprRaw1, outline=FALSE, las=2, cex=0.25, main="exprRaw", col="yellow")
#exprLumi boxplot
par(mai=c(1,0.8,1,0.8))
boxplot(exprLumi, outline=FALSE, las=2, cex=0.25, main="exprLumi", col="yellow")
#exprLog boxplot
par(mai=c(1,0.8,1,0.8))
boxplot(exprLog, outline=FALSE, las=2, cex=0.25, main="exprLog", col="yellow")
#exprRaw MDSplot, colored by Study
par(mai=c(1,0.8,1,0.8))
plotMDS(exprRaw, legend= "all", main="exprRaw Study", cex=0.5)#like PCA plot
#exprRaw1 MDSplot, colored by Study
par(mai=c(1,0.8,1,0.8))
plotMDS(exprRaw1, legend= "all", main="exprRaw1 Study", cex=0.5)#like PCA plot
#exprLumi MDSplot, colored by Study
par(mai=c(1,0.8,1,0.8))
plotMDS(exprLumi, legend= "all", main="exprLumi Study", cex=0.5)#like PCA plot
#exprLog MDSplot, colored by Study
par(mai=c(1,0.8,1,0.8))
plotMDS(exprLog, legend= "all", main="exprLog Study", cex=0.5)#like PCA plot
dev.off()
## quartz_off_screen
## 2
#8.1 Getting GPL annotation file
GPLid<-annotation(gset)
GPL_file<-getGEO(GPLid)
## File stored at:
## /var/folders/0g/5tz57sgx7090n5vttfqvs3ym0000gn/T//Rtmpv5hBUt/GPL6480.soft
colnames(GPL_file@dataTable@table)
## [1] "ID" "SPOT_ID" "CONTROL_TYPE"
## [4] "REFSEQ" "GB_ACC" "GENE"
## [7] "GENE_SYMBOL" "GENE_NAME" "UNIGENE_ID"
## [10] "ENSEMBL_ID" "TIGR_ID" "ACCESSION_STRING"
## [13] "CHROMOSOMAL_LOCATION" "CYTOBAND" "DESCRIPTION"
## [16] "GO_ID" "SEQUENCE"
write.table(GPL_file@dataTable@table,"GPL6480.txt", sep='\t', quote=F)
#We pick the ID or reporter ID and ORF or gene symbols from options displayed above. We need this to annotate expression data. Will use ID to combine.
GPL_file1<-GPL_file@dataTable@table[,c("ID","GENE_SYMBOL")]
DT::datatable(head(GPL_file1))
#8.2 preparing gene expression data exprLog for merging with annotation file
exprGene=exprLog
#colnames already are sample GSM id
DT::datatable(head(exprGene))
#Add "ID" column that we will use for merging with annotation
exprGene1=cbind(exprGene, rownames(exprGene))
colnames(exprGene1)=c(colnames(exprGene),"ID")
DT::datatable(head(exprGene1))
#8.3 get gene expression data with gene symbols. GEO annotation is static and ensures that annotation of genes does not change over time.
#This is complete gene expression with Gene symbol annotations
#Save and use columns for pipelines as needed
exprGene2=merge(exprGene1, GPL_file1, by="ID")
dim(exprGene2)
## [1] 22627 15
DT::datatable(head(exprGene2))
write.csv(exprGene2, "GSE43490_Annotation_Expr_HuGene.csv")
#The files we needed are saved so we now clear workspace and delete files and folders that are not needed
save.image(file="preSVALM_GSE43490_temp.RData")
rm(list=ls())
gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 8224275 439.3 26482191 1414.4 NA 22198464 1185.6
## Vcells 22624544 172.7 320402187 2444.5 102400 400418118 3055.0
Merging GSE33000 and GSE43490
#save working directory location
wd<-getwd()
wd
## [1] "/Users/powermac/Desktop/HuAgeDis11-30-18"
#Install packages by uncommenting two lines below if packages not already installed before
#source("https://bioconductor.org/biocLite.R")
#biocLite(c("tidyr","dplyr"))
library(dplyr) # used for general data wrangling
library(tidyr) # used for tidying tables
GSE33000Expr=read.csv("GSE33000_Annotation_Expr_HuGene.csv", header =T, sep=',', na.strings = c("", "NA"))
GSE33000Expr=GSE33000Expr %>% na.omit()
GSE43490Expr=read.csv("GSE43490_Annotation_Expr_HuGene.csv", header =T, sep=',', na.strings = c("", "NA"))
GSE43490Expr=GSE43490Expr %>% na.omit()
DT::datatable(GSE33000Expr[1:7,1:7])
DT::datatable(GSE43490Expr[1:7,1:7])
dim(GSE33000Expr)
## [1] 21159 627
dim(GSE43490Expr)
## [1] 11776 16
#Rename the last column name of both datasets to same 'GeneHu'
colnames(GSE33000Expr)[colnames(GSE33000Expr)=='ORF']='GeneHu'
colnames(GSE43490Expr)[colnames(GSE43490Expr)=='GENE_SYMBOL']='GeneHu'
mergeExpr=merge(GSE33000Expr,GSE43490Expr, by='GeneHu')
dim(mergeExpr)
## [1] 11762 642
dim(GSE33000Expr)
## [1] 21159 627
dim(GSE43490Expr)
## [1] 11776 16
#Except one ID column and one gene column, remove other non-sample annotation columns.
#Here picked GSE33000 ID column and gene column to save. Since in microarray probe IDs are unique genes are not unique
mergeExpr=mergeExpr[-c(2,628,629)]
dim(mergeExpr)
## [1] 11762 639
merge_metadata=read.csv('./InputHuAgeDis/merge_GSE33000_GSE43490_metadata.csv', header =T, sep=',')
DT::datatable(head(merge_metadata))
#Number of rows of above metadata is same as number of columns of Expr data +2 for GeneHu and probe IDs
dim(mergeExpr)
## [1] 11762 639
dim(merge_metadata)
## [1] 637 7
#Replace of column names in expression with sample names
colnames(mergeExpr) <- merge_metadata$Sample_name[match(colnames(mergeExpr), merge_metadata$GEO_Accession)]
colnames(mergeExpr)[1]<-"GeneHu"
colnames(mergeExpr)[2]<-"IDprobe"
#Lets drop the GEO accession number from the metadata now
merge_metadata=merge_metadata[,-c(1)]
DT::datatable(head(merge_metadata))
write.csv(mergeExpr,"merge_GSE33000_GSE43490_Expr_GeneHu_SampleID.csv")
write.csv(merge_metadata,"merge_GSE33000_GSE43490_metadata_SampleID.csv")
save.image(file="preSVALM_merge_GSE33000_GSE43490_temp.RData")
rm(list=ls())
gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 8226011 439.4 26482191 1414.4 NA 22198464 1185.6
## Vcells 22629699 172.7 256321749 1955.6 102400 400418118 3055.0
SVA+LM normalization for study or experiment or batch correction and removal of gender
#save working directory location
wd<-getwd()
wd
## [1] "/Users/powermac/Desktop/HuAgeDis11-30-18"
#source("https://bioconductor.org/biocLite.R")
#biocLite(c("sva","limma","bladderbatch","pamr"))
library(limma)
library(sva)
library(pamr)
library(ggplot2) # plot
library(ggmap)#plots
library(gplots)#plots
library(RColorBrewer)#color pallet
library(Hmisc)#for corelation plot
library(viridis)#for corelation plot
library(corrplot)#for corelation plot
#Metadata_coded
metadata_coded=read.csv("./InputHuAgeDis/merge_GSE33000_GSE43490_metadata_SampleID_Disease_single_coded.csv", header=T, sep=',')
#Expression data
Expr_HuGene=read.csv("merge_GSE33000_GSE43490_Expr_GeneHu_SampleID.csv", header =T, sep=',')
dim(metadata_coded)
## [1] 637 7
dim(Expr_HuGene)
## [1] 11762 640
#Need to make sure that the Gene IDs are unique to specify rownames with Gene IDs
Expr_HuGene = Expr_HuGene[!duplicated(Expr_HuGene$GeneHu),]
dim(metadata_coded)
## [1] 637 7
dim(Expr_HuGene)
## [1] 7301 640
#We lost some gene IDs are there are multiple probeIDs for genes IDs in microarray
Expr_HuGene_1=Expr_HuGene[,-c(1,2,3)] #get rid of some extra probe ID columns
rownames(Expr_HuGene_1)=Expr_HuGene$GeneHu
Expr_HuGene_1=Expr_HuGene_1[complete.cases(Expr_HuGene_1), ]
edata=Expr_HuGene_1
metadata_coded_1=metadata_coded[,3:7]
rownames(metadata_coded_1)=metadata_coded$Sample_name
pheno=metadata_coded_1
DT::datatable(pheno)
DT::datatable(edata[1:7,1:7])
dim(metadata_coded)
## [1] 637 7
dim(pheno)
## [1] 637 5
dim(Expr_HuGene)
## [1] 7301 640
dim(edata)
## [1] 7301 637
#Make sure we did not lose any samples in expression data
#Here in the output the 1st two and the last two should be same i.e. the first and last samples are same
colnames(Expr_HuGene[4])
## [1] "F_67_AD_GSM1423780"
colnames(edata[1])
## [1] "F_67_AD_GSM1423780"
colnames(Expr_HuGene[640])
## [1] "F_70_CON_GSM1294130"
colnames(edata[637])
## [1] "F_70_CON_GSM1294130"
#full model matrix, variable of interest and other variables
#Put categorical variables ahead of numeric variables
#This dataset has two tissue sources.But the tissue here is also correlated with batch i.e. the two tissue is from two batches so this introduces computaational singularity and cannot be modeleed. So one of two such covariants can be used in mod. So here Study or batch is used and Tissue is not included in the model
mod = model.matrix(~Study+Gender+Disease+Age, data=pheno)
#null model matrix all except the variable of interest
#To include only an intercept use mode 'mod0 = model.matrix(~1,data=pheno)'
#Put categorical variables ahead of numeric variables
mod0 = model.matrix(~Study+Gender,data=pheno)
#Estimation of Surrogate variable
n.sv = num.sv(as.matrix(edata),mod,method="be")
svobj = sva(as.matrix(edata),mod,mod0,n.sv=n.sv, B=20)
## Number of significant surrogate variables is: 40
## Iteration (out of 20 ):1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
sva function returns a list of 4 components: 1) sv= matrix with columns corrosponding to estimate of surrogate variables 2) pprob.gam=probability that each gene is associated with one or more latent variables 3) pprob.b=probability that each gene is associated with our variable of interest 4) n.sv=number of surrogate variables
#Note here the column which we care about is not used pheno[c(-?)], Age in 3rd column, Disease in 4th column
pheno_sva=cbind(pheno[-c(3,4)],svobj$sv)
reglm_sva<-lapply(1:nrow(edata), function(x){lm(unlist(edata[x,])~.,data=pheno_sva)})
DT::datatable(pheno_sva)
residuals<-lapply(reglm_sva, function(x)residuals(summary(x)))
residuals<-do.call(rbind, residuals)
edata_adjresiduals<-residuals+matrix(apply(edata, 1, mean), nrow=nrow(residuals), ncol=ncol(residuals))
rownames(edata_adjresiduals)=rownames(edata)
rownames(residuals)=rownames(edata)
DT::datatable(edata_adjresiduals[1:7,1:7])
dim(edata_adjresiduals)
## [1] 7301 637
write.csv(edata_adjresiduals, "edata_adjresiduals.csv")
#Save .RData and clear environment to free up memory
save.image(file="temp.RData")
rm(list=ls())
gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 8429130 450.2 26482191 1414.4 NA 26482191 1414.4
## Vcells 22996190 175.5 490474000 3742.1 102400 570842476 4355.2
#This 'reglm_sva' is a big object so we remove it as we have saved the other variables we need
load(file="temp.RData")
rm(reglm_sva)
gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 8439126 450.7 26482191 1414.4 NA 26482191 1414.4
## Vcells 46401020 354.1 472528580 3605.2 102400 570842476 4355.2
edata #After SVA and LM adjustment edata_adjresiduals
#Summary Statistics edata
DT::datatable(summary(edata[1:7,1:7]))
write.csv(summary(edata),"summary_stats_edata.csv")
#Summary Statistics edata_adjresiduals
DT::datatable(summary(edata_adjresiduals[1:7,1:7]))
write.csv(summary(edata_adjresiduals),"summary_stats_edata_adjresiduals.csv")
pdf(file="Visualization_GSE33000_GSE43490_batch_before_and_after_SVA-LM.pdf",height=10,width=10)
par(mfrow=c(2,2))
#Before Correlation
# Colorbar along the heatmap represents Study or batch
nconditions <- nlevels(as.factor(pheno$Study))
npal <- colorRampPalette(brewer.pal(nconditions, "Set1"))(nconditions)
condition_colors <- npal[as.integer(pheno$Study)]
heatmap.2(cor(edata), RowSideColors=condition_colors,
trace='none', cexRow=0.5, main='Sample correlations before SVA LM Study')
#After Correlation
# Colorbar along the heatmap represents Study
nconditions <- nlevels(as.factor(pheno$Study))
npal <- colorRampPalette(brewer.pal(nconditions, "Set1"))(nconditions)
condition_colors <- npal[as.integer(pheno$Study)]
heatmap.2(cor(edata_adjresiduals), RowSideColors=condition_colors,
trace='none', cexRow=0.5, main='Sample correlations after SVA LM Study')
#Before boxplot
par(mai=c(1,0.8,1,0.8))
boxplot(edata, outline=FALSE, las=2, cex=0.25, main="before SVA LM Study", col="yellow")
#After boxplot
par(mai=c(1,0.8,1,0.8))
boxplot(edata_adjresiduals, outline=FALSE, las=2, cex=0.25, main="after SVA LM Study", col="yellow")
#Before MDSplot, colored by Study
par(mai=c(1,0.8,1,0.8))
plotMDS(edata, col=condition_colors, legend= "all", main="before SVA LM Study", cex=0.5)#like PCA plot
#After MDSplot, colored by Study
par(mai=c(1,0.8,1,0.8))
plotMDS(edata_adjresiduals, col=condition_colors, legend= "all", main="after SVA LM Study", cex=0.5)#like PCA plot
#Before PCAplot, colored by Study or batch
pca0=prcomp(t(edata), center=T, scale=T)
names(pca0)
## [1] "sdev" "rotation" "center" "scale" "x"
#summary(pca0)
pca0$x[1:3,1:3]
## PC1 PC2 PC3
## F_67_AD_GSM1423780 -10.89664 -5.974611 12.119206
## M_88_AD_GSM1423781 -13.14602 -6.850155 2.329893
## M_62_AD_GSM1423782 -12.19850 -20.985231 3.787954
plot(x=pca0$x[,1], y=pca0$x[,2],
cex=0,
xlab="PC1", ylab="PC2",
main="PC plot colored by Study or batch before SVA LM",
font=2
)
text(x=pca0$x[,1], y=pca0$x[,2],
labels=pheno$Age,
col=pheno$Study,
cex=0.7, font=2)
legend("bottomright",
legend=c("GSE33000", "GSE43490"),
text.col=c("red", "blue")
)
#After PCAplot, colored by Study or batch
pca0=prcomp(t(edata_adjresiduals), center=T, scale=T)
names(pca0)
## [1] "sdev" "rotation" "center" "scale" "x"
#summary(pca0)
pca0$x[1:3,1:3]
## PC1 PC2 PC3
## F_67_AD_GSM1423780 -3.39660 7.322029 0.7392155
## M_88_AD_GSM1423781 -39.11319 25.531541 -1.4343964
## M_62_AD_GSM1423782 -46.60163 0.569492 -8.6183091
plot(x=pca0$x[,1], y=pca0$x[,2],
cex=0,
xlab="PC1", ylab="PC2",
main="PC plot colored by Study or batch after SVA LM",
font=2
)
text(x=pca0$x[,1], y=pca0$x[,2],
labels=pheno$Age,
col=pheno$Study,
cex=0.7, font=2)
legend("bottomright",
legend=c("GSE33000", "GSE43490"),
text.col=c("red", "blue")
)
dev.off()
## quartz_off_screen
## 2
#Export edata_adjustedresidual SVA and LM normalized data for WGCNA
write.csv(edata_adjresiduals, "AgeDiseaseInterest_edata_adjresiduals_GeneID.csv")
save.image(file="SVALM_temp.RData")
Human WGCNA Disease and Age
#save working directory location
wd<-getwd()
wd
## [1] "/Users/powermac/Desktop/HuAgeDis11-30-18"
#Load additional functions required for this pipeline
#Reference: Miller, J.A., Horvath, S., and Geschwind, D.H. (2010). Divergence of Human and mouse brain transcriptome highlights Alzheimer disease pathways. Proceedings of the National Academy of Sciences of the United States of America 107, 12698-12703.
write.geneList <- function(PG, filename, allProbes=0, allGenes=0, probe="g")
{
## These functions write a genelist / probelist to a file of geneNames
## USER inputs
# PG = the probe/gene you want written to a gene list
# allProbes = the list of probe names for the above probes
# allGenes = the list of gene names for the corresponding probes
# filename = the filename (can include folder)
# probe = the default ("g") says PG is a gene and doesn''t need to be converted
# to a gene. Otherwise PG is assumed to be a probe and converted
gene = PG
if (probe!="g") {
gene = probe2Gene(PG,allProbes,allGenes)
}
write(gene,filename,sep="\n")
}
cor.test.l = function(x){
## Performs a Pearson correlation on a vector of genes
ct = cor.test(x,var)
return(c(ct$est,ct$p.val))
}
#library(BiocInstaller)
#biocLite("qvalue")
#install.packages(c("impute","dynamicTreeCut","flashClust","Hmisc","WGCNA","stringi","enrichR"))
library(impute)
library(dynamicTreeCut)
library(qvalue)
library(flashClust)
library(Hmisc)
library(WGCNA)
## ==========================================================================
## *
## * Package WGCNA 1.66 loaded.
## *
## * Important note: It appears that your system supports multi-threading,
## * but it is not enabled within WGCNA in R.
## * To allow multi-threading within WGCNA with all available cores, use
## *
## * allowWGCNAThreads()
## *
## * within R. Use disableWGCNAThreads() to disable threading if necessary.
## * Alternatively, set the following environment variable on your system:
## *
## * ALLOW_WGCNA_THREADS=<number_of_processors>
## *
## * for example
## *
## * ALLOW_WGCNA_THREADS=8
## *
## * To set the environment variable in linux bash shell, type
## *
## * export ALLOW_WGCNA_THREADS=8
## *
## * before running R. Other operating systems or shells will
## * have a similar command to achieve the same aim.
## *
## ==========================================================================
library(stringi)
library(stringr)
library(enrichR)#for pathway analysis
options(stringsAsFactors = FALSE)
#Human Metadata
metadata=read.csv("./InputHuAgeDis/merge_GSE33000_GSE43490_metadata_SampleID_Disease_single.csv", header=T, sep=',')
metadata_1=metadata[,3:7]
rownames(metadata_1)=metadata$Sample_name
pheno=metadata_1
#Human Metadata coded
metadata_coded=read.csv("./InputHuAgeDis/merge_GSE33000_GSE43490_metadata_SampleID_Disease_single_coded.csv", header=T, sep=',')
metadata_1_coded=metadata_coded[,3:7]
rownames(metadata_1_coded)=metadata$Sample_name
pheno_coded=metadata_1_coded
#Human Expression data
Expr_HuGene=read.csv("AgeDiseaseInterest_edata_adjresiduals_GeneID.csv", header =T, sep=',')
Expr_HuGene_1=Expr_HuGene[,-1]
rownames(Expr_HuGene_1)=Expr_HuGene$X
Expr_HuGene_1=Expr_HuGene_1[complete.cases(Expr_HuGene_1), ]
edata=Expr_HuGene_1
#Human Aging data
datExprA1g=edata
metadataA1g=pheno
metadataA1g_coded=pheno_coded
#Remove unused variables
rm(metadata)
rm(metadata_1)
rm(pheno)
rm(metadata_coded)
rm(metadata_1_coded)
rm(pheno_coded)
rm(Expr_HuGene)
rm(Expr_HuGene_1)
rm(edata)
DT::datatable(datExprA1g[1:7,1:7])
dim(datExprA1g)
## [1] 7301 637
#Before applying 'goodSamplesGenes'
dim(datExprA1g)
## [1] 7301 637
#check and applying 'goodSamplesGenes' on data A1
gsg = goodSamplesGenes(datExprA1g, verbose = 3)
## Flagging genes and samples with too many missing values...
## ..step 1
gsg$allOK
## [1] TRUE
if (!gsg$allOK)
{
if (sum(!gsg$goodGenes)>0)
#printFlush(paste("Removing genes:", paste(names(datExprA1g)[!gsg$goodGenes], collapse = ", ")));
if (sum(!gsg$goodSamples)>0)
#printFlush(paste("Removing samples:", paste(rownames(datExprA1g)[!gsg$goodSamples], collapse = ", ")));
# Removal step
datExprA1g = datExprA1g[gsg$goodSamples, gsg$goodGenes]
}
#After applying 'goodSamplesGenes'
dim(datExprA1g)
## [1] 7301 637
#Transpose data
datExprA1g1=as.data.frame(t(datExprA1g))
names(datExprA1g1)=rownames(datExprA1g)
rownames(datExprA1g1)=names(datExprA1g)
DT::datatable(datExprA1g1[1:7,1:7])
#Calculate variance
var = apply(datExprA1g1, 2, var)
dat = rbind(datExprA1g1,var)
rownames(dat) = c(rownames(datExprA1g1), "variance")
#order columns by variance
dat1 = dat[,order(dat["variance",], decreasing=T)]
#Remove row containing variance values
#Here we select all genes
dat2 = dat1[1:dim(datExprA1g1)[1],1:dim(datExprA1g1)[2]]
#To select only a given number of top genes like top7000 uncomment line below
#dat2 = dat1[1:dim(datExprA1g1)[1],1:7000]
datExprA1g2=as.data.frame(t(dat2))
names(datExprA1g2)=rownames(dat2)
rownames(datExprA1g2)=names(dat2)
#remove unused variables
rm(var)
rm(dat1)
#Comparison of data A1 after variance based selction
dim(datExprA1g)
## [1] 7301 637
DT::datatable(datExprA1g[1:7,1:7])
dim(datExprA1g2)
## [1] 7301 637
DT::datatable(datExprA1g2[1:7,1:7])
here onwards will use datExprA1g2
#softPower estimate after 'goodSamplesgoodGenes' and variance based selection
#Pick power for the data A1
powers = c(c(1:10), seq(from = 12, to=40, by=2))
sftA1 = pickSoftThreshold(datExprA1g2, RsquaredCut=0.80, powerVector = powers, networkType="signed", moreNetworkConcepts=TRUE, verbose = 5, blockSize=5000)
## pickSoftThreshold: calculating connectivity for given powers...
## ..working on genes 1 through 637 of 637
## Power SFT.R.sq slope truncated.R.sq mean.k. median.k. max.k.
## 1 1 0.89500 14.3000 0.960 448.0000 454.0000 485.000
## 2 2 0.84300 6.2100 0.939 319.0000 327.0000 372.000
## 3 3 0.73500 3.3600 0.879 229.0000 236.0000 288.000
## 4 4 0.63000 1.9900 0.861 167.0000 172.0000 225.000
## 5 5 0.48500 1.3200 0.835 122.0000 127.0000 177.000
## 6 6 0.36400 0.8990 0.839 90.2000 93.0000 140.000
## 7 7 0.16300 0.5120 0.779 67.2000 68.8000 112.000
## 8 8 0.00733 0.0944 0.724 50.4000 51.3000 91.000
## 9 9 0.02070 -0.1650 0.715 38.0000 38.2000 74.000
## 10 10 0.11500 -0.4210 0.682 28.9000 28.5000 60.500
## 11 12 0.32100 -0.7430 0.783 17.0000 16.0000 40.900
## 12 14 0.44600 -1.0800 0.791 10.2000 9.1900 28.100
## 13 16 0.51500 -1.2900 0.843 6.2300 5.2700 19.500
## 14 18 0.59500 -1.3700 0.884 3.8800 3.1200 13.700
## 15 20 0.65200 -1.4300 0.913 2.4500 1.8700 9.660
## 16 22 0.69600 -1.4900 0.933 1.5700 1.1100 6.880
## 17 24 0.72900 -1.4900 0.954 1.0200 0.6640 4.930
## 18 26 0.79500 -1.4800 0.974 0.6690 0.4140 3.550
## 19 28 0.81600 -1.4700 0.979 0.4440 0.2540 2.580
## 20 30 0.81300 -1.5000 0.969 0.2980 0.1600 1.870
## 21 32 0.81000 -1.5500 0.942 0.2010 0.1030 1.370
## 22 34 0.78700 -1.5500 0.890 0.1370 0.0645 1.010
## 23 36 0.83700 -1.5600 0.961 0.0946 0.0405 0.763
## 24 38 0.84800 -1.6000 0.956 0.0656 0.0256 0.582
## 25 40 0.80800 -1.6600 0.877 0.0459 0.0162 0.447
## Density Centralization Heterogeneity
## 1 7.04e-01 0.058900 0.0531
## 2 5.01e-01 0.084400 0.1030
## 3 3.61e-01 0.092500 0.1500
## 4 2.62e-01 0.091900 0.1960
## 5 1.92e-01 0.086300 0.2400
## 6 1.42e-01 0.078500 0.2840
## 7 1.06e-01 0.071500 0.3260
## 8 7.92e-02 0.064000 0.3670
## 9 5.98e-02 0.056700 0.4080
## 10 4.54e-02 0.049800 0.4480
## 11 2.67e-02 0.037800 0.5260
## 12 1.60e-02 0.028200 0.6010
## 13 9.80e-03 0.020900 0.6740
## 14 6.09e-03 0.015400 0.7440
## 15 3.85e-03 0.011400 0.8130
## 16 2.47e-03 0.008380 0.8790
## 17 1.60e-03 0.006170 0.9440
## 18 1.05e-03 0.004550 1.0100
## 19 6.98e-04 0.003360 1.0700
## 20 4.68e-04 0.002490 1.1300
## 21 3.17e-04 0.001840 1.1900
## 22 2.16e-04 0.001370 1.2500
## 23 1.49e-04 0.001050 1.3100
## 24 1.03e-04 0.000815 1.3700
## 25 7.21e-05 0.000633 1.4300
par(mfrow = c(1,2));
cex1 = 0.9;
#scale-free topology fit index as function of soft-thresholding power
plot(sftA1$fitIndices[,1], -sign(sftA1$fitIndices[,3])*sftA1$fitIndices[,2],
xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
main = paste("Scale independence"));
text(sftA1$fitIndices[,1], -sign(sftA1$fitIndices[,3])*sftA1$fitIndices[,2],
labels=powers,cex=cex1,col="red");
#this line corresponds to using R^2 cut-off of h
abline(h=0.90,col="red")
# Mean connectivity as a function of the soft-thresholding power
plot(sftA1$fitIndices[,1], sftA1$fitIndices[,5],
xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
main = paste("Mean connectivity"))
text(sftA1$fitIndices[,1], sftA1$fitIndices[,5], labels=powers, cex=cex1,col="red")
dev.print(pdf, "A1_auto-power_plot.pdf",height=10,width=18)
## quartz_off_screen
## 2
#sftA1$powerEstimate is systems best guess
softPowerA1=sftA1$powerEstimate
softPowerA1
## [1] 1
#set soft power to value 12 for signed networks if above automatic detection detects low power.
softPowerA1=12
#Calculate weighted adjacency
adjacencyA1 = adjacency(t(datExprA1g2),power=softPowerA1,type="signed");
diag(adjacencyA1)=0
#Turn adjacency into topology overlap matrix
TOM=TOMsimilarity(adjacencyA1, TOMType="signed")
## ..connectivity..
## ..matrix multiplication (system BLAS)..
## ..normalization..
## ..done.
dissTOMA1 = 1-TOM
#Hierarchial clustering based on TOM
geneTreeA1 = flashClust(as.dist(dissTOMA1), method="average")
#Display the networks visually:
#pdf("dendrogram.pdf",height=6,width=16)
#par(mar=c(1,1,1,1))
#par(mfrow=c(1,2))
plot(geneTreeA1,xlab="",sub="",main="Gene clustering on TOM-based dissimilarity (A1)",
labels=FALSE,hang=0.04);
#dev.off()
dev.print(pdf,file="dendrogram.pdf",height=5,width=10)
## quartz_off_screen
## 2
#Modules for dataset A1 or modulesA1
mColorh=NULL
mColorhL=NULL
for (ds in 0:3){
tree = cutreeHybrid(dendro = geneTreeA1, pamStage=TRUE,
minClusterSize = (30-3*ds), cutHeight = 0.99,
deepSplit = ds, distM = dissTOMA1)
mColorh=cbind(mColorh,labels2colors(tree$labels));
mColorhL=cbind(mColorhL, paste("HuAgeDis_", str_pad(tree$labels, 2, pad="0"), sep="" ))
}
## ..done.
## ..done.
## ..done.
## ..done.
#pdf("Module_choices.pdf", height=10,width=25);
plotDendroAndColors(geneTreeA1, mColorh, paste("dpSplt =",0:3), main = "",dendroLabels=FALSE);
#dev.off()
dev.print(pdf,file="Module_choices.pdf", height=5,width=10)
## quartz_off_screen
## 2
#Picked deepSplit or dpSplit 2 which gives small number of large modules
modulesA1 = mColorh[,3] #color labels of modules
modulesA1L= mColorhL[,3] #numeric label of modules
#number of genes per module
#Principal component for visualization
PCs1A = moduleEigengenes(t(datExprA1g2), colors=modulesA1)
ME_1A = PCs1A$eigengenes
distPC1A = 1-abs(cor(ME_1A,use="p"))
distPC1A = ifelse(is.na(distPC1A), 0, distPC1A)
pcTree1A = hclust(as.dist(distPC1A),method="a")
MDS_1A = cmdscale(as.dist(distPC1A),2)
colorsA1 = names(table(modulesA1))
par(mfrow=c(1,1), mar=c(0, 3, 1, 1) + 0.1, cex=1)
plot(pcTree1A, xlab="",ylab="",main="",sub="")
dev.print(pdf,file="ModuleEigengeneVisualizationsTree.pdf",height=5,width=10)
## quartz_off_screen
## 2
par(mfrow=c(1,1), mar=c(0, 3, 1, 1) + 0.1, cex=1)
plot(MDS_1A, col= colorsA1, main="MDS plot", cex=2, pch=19)
dev.print(pdf,file="ModuleEigengeneVisualizationsMDS.pdf",height=5,width=5)
## quartz_off_screen
## 2
PCs1AL = moduleEigengenes(t(datExprA1g2), colors=modulesA1L)
ME_1AL = PCs1AL$eigengenes
distPC1AL = 1-abs(cor(ME_1AL,use="p"))
distPC1AL = ifelse(is.na(distPC1AL), 0, distPC1AL)
pcTree1AL = hclust(as.dist(distPC1AL),method="a")
MDS_1AL = cmdscale(as.dist(distPC1AL),2)
colorsA1L = names(table(modulesA1L))
par(mfrow=c(1,1), mar=c(0, 3, 1, 1) + 0.1, cex=1)
plot(pcTree1AL, xlab="",ylab="",main="",sub="")
dev.print(pdf,file="ModuleEigengeneVisualizationsTreelabels.pdf",height=5,width=10)
## quartz_off_screen
## 2
par(mfrow=c(1,1), mar=c(0, 3, 1, 1) + 0.1, cex=1)
#plot(MDS_1AL, col= colorsA1L, main="MDS plot", cex=2, pch=19)
#dev.print(pdf,file="ModuleEigengeneVisualizationsMDSlabels.pdf",height=5,width=5)
#kME to measure of correlations between each gene and each ME
#for data A1
geneModuleMembership1 = signedKME(t(datExprA1g2), ME_1A)
colnames(geneModuleMembership1)=paste("PC",colorsA1,".cor",sep="");
MMPvalue1=corPvalueStudent(as.matrix(geneModuleMembership1),dim(datExprA1g2)[[2]]);
colnames(MMPvalue1)=paste("PC",colorsA1,".pval",sep="");
Gene = rownames(datExprA1g2)
kMEtable1 = cbind(Gene,Gene,modulesA1)
for (i in 1:length(colorsA1))
kMEtable1 = cbind(kMEtable1, geneModuleMembership1[,i], MMPvalue1[,i])
colnames(kMEtable1)=c("PSID","Gene","Module",sort(c(colnames(geneModuleMembership1),
colnames(MMPvalue1))))
write.csv(kMEtable1,"kMEtable1colors.csv")
#Hub genes are genes that show significant correlation with MEs and high within-module connectivity
#kME values are used to compare data A1 to determine which genes have extremely high kME values in networks
#and are thus genes are hubs in both networks
#ref: modified from https://support.bioconductor.org/p/94701/
topGenesKME = NULL
for (c in 1:length(colorsA1)){
kMErank1 = rank(-geneModuleMembership1[,c])
#maxKMErank = rank(apply(cbind(kMErank1,kMErank2+.00001),1,max))
#topGenesKME = cbind(topGenesKME,Gene[maxKMErank<=10])
topGenesKME = cbind(topGenesKME,Gene[order(kMErank1, decreasing=FALSE)][1:10])
}
colnames(topGenesKME) = colorsA1
topGenesKME
## black blue brown green greenyellow grey
## [1,] "TBXAS1" "PRKD3" "RAF1" "LRP10" "CFH" "ANXA4"
## [2,] "TYROBP" "AFF1" "SAFB2" "MAPKAPK2" "ANXA2P1" "PAN3"
## [3,] "PTPRC" "NFE2L2" "MAPKAPK2" "METTL7A" "KIAA1199" "C6orf70"
## [4,] "ITGB2" "PTTG1IP" "PHF19" "CD59" "ANXA2" "KIAA1407"
## [5,] "MYO1F" "DYNLT1" "SIPA1" "SERPINB6" "FRZB" "RAPGEF6"
## [6,] "LST1" "MAPRE1" "AUP1" "INPPL1" "DCN" "ZKSCAN1"
## [7,] "CYBA" "TBC1D2B" "KLHL21" "AFF1" "FBLN5" "EEF1A1"
## [8,] "SLC7A7" "NPC2" "ZNF646" "TBC1D2B" "FMOD" "DNAJC10"
## [9,] "RNASET2" "CD302" "RBM4" "ZFP36L1" "SVIL" "ADD3"
## [10,] "FCER1G" "LRP10" "C1orf144" "TBL1X" "BMP6" "DHTKD1"
## magenta pink purple red turquoise yellow
## [1,] "DCTN3" "SYP" "P4HA2" "ACTR3B" "NRXN3" "HSF1"
## [2,] "PARD6A" "SSBP4" "CHORDC1" "ACOT7" "GOT1" "PHF1"
## [3,] "SUMO3" "ATP6V0C" "STIP1" "C10orf35" "MDH2" "IRF2BP1"
## [4,] "CUEDC2" "RUSC1" "DEDD2" "SVOP" "NRN1" "HECTD3"
## [5,] "RUVBL2" "PARD6A" "DNAJA1" "CLTA" "TOMM20" "FAM89B"
## [6,] "HRAS" "GSK3A" "MKNK2" "LARGE" "NAP1L5" "ANKRD13B"
## [7,] "ASNA1" "LPHN1" "SPR" "NFKBIE" "ATP6AP2" "ADRM1"
## [8,] "COX5A" "DLGAP4" "DNAJB1" "C6orf154" "CMAS" "ZNF574"
## [9,] "MRPL46" "CFL1" "BAG3" "TRIM9" "GPRASP1" "FAM100A"
## [10,] "C17orf81" "CBX4" "HSPH1" "TTC9B" "NKIRAS1" "DEDD"
write.csv(topGenesKME,"A1_HubGenestopGenesKMEcolorsRankSorted.csv")
#These genes or probes represent the top 10 hub genes for modules in data A1 networks.
#Hub genes are genes that show significant correlation with MEs and high within-module connectivity
#kME values are used to compare data A1 to determine which genes have extremely high kME values in networks
#and are thus genes are hubs in both networks
topGenesKME = NULL
for (c in 1:length(colorsA1)){
kMErank1 = rank(-geneModuleMembership1[,c])
maxKMErank = rank(apply(cbind(kMErank1),1,max))
#maxKMErank = rank(apply(cbind(kMErank1,kMErank2+.00001),1,max))
topGenesKME = cbind(topGenesKME,Gene[maxKMErank<=10])
}
colnames(topGenesKME) = colorsA1
topGenesKME
## black blue brown green greenyellow grey
## [1,] "TYROBP" "CD302" "SIPA1" "ZFP36L1" "FMOD" "EEF1A1"
## [2,] "TBXAS1" "LRP10" "PHF19" "CD59" "KIAA1199" "ADD3"
## [3,] "FCER1G" "PTTG1IP" "MAPKAPK2" "METTL7A" "FRZB" "KIAA1407"
## [4,] "ITGB2" "NFE2L2" "KLHL21" "LRP10" "CFH" "RAPGEF6"
## [5,] "PTPRC" "NPC2" "SAFB2" "MAPKAPK2" "ANXA2" "PAN3"
## [6,] "SLC7A7" "PRKD3" "RBM4" "AFF1" "ANXA2P1" "ANXA4"
## [7,] "LST1" "AFF1" "C1orf144" "INPPL1" "DCN" "ZKSCAN1"
## [8,] "MYO1F" "MAPRE1" "ZNF646" "TBL1X" "BMP6" "DHTKD1"
## [9,] "RNASET2" "DYNLT1" "AUP1" "SERPINB6" "SVIL" "C6orf70"
## [10,] "CYBA" "TBC1D2B" "RAF1" "TBC1D2B" "FBLN5" "DNAJC10"
## magenta pink purple red turquoise yellow
## [1,] "PARD6A" "LPHN1" "BAG3" "SVOP" "NRN1" "IRF2BP1"
## [2,] "SUMO3" "CBX4" "DNAJB1" "ACTR3B" "NAP1L5" "FAM100A"
## [3,] "COX5A" "ATP6V0C" "MKNK2" "C6orf154" "NRXN3" "PHF1"
## [4,] "HRAS" "DLGAP4" "CHORDC1" "TRIM9" "TOMM20" "HECTD3"
## [5,] "MRPL46" "SYP" "P4HA2" "TTC9B" "GOT1" "HSF1"
## [6,] "RUVBL2" "CFL1" "HSPH1" "NFKBIE" "GPRASP1" "ANKRD13B"
## [7,] "DCTN3" "SSBP4" "DNAJA1" "CLTA" "NKIRAS1" "ZNF574"
## [8,] "C17orf81" "PARD6A" "SPR" "ACOT7" "ATP6AP2" "ADRM1"
## [9,] "ASNA1" "GSK3A" "DEDD2" "C10orf35" "MDH2" "FAM89B"
## [10,] "CUEDC2" "RUSC1" "STIP1" "LARGE" "CMAS" "DEDD"
write.csv(topGenesKME,"A1_HubGenestopGenesKMEcolorsRankUnsorted.csv")
#These genes or probes represent the top 10 hub genes for modules in data A1 networks.
#kME to measure of correlations between each gene and each ME
#for data A1
geneModuleMembership1L = signedKME(t(datExprA1g2), ME_1AL)
colnames(geneModuleMembership1L)=paste("PC",colorsA1L,".cor",sep="");
MMPvalue1L=corPvalueStudent(as.matrix(geneModuleMembership1L),dim(datExprA1g2)[[2]]);
colnames(MMPvalue1L)=paste("PC",colorsA1L,".pval",sep="");
GeneL = rownames(datExprA1g2)
kMEtable1L = cbind(GeneL,GeneL,modulesA1L)
for (i in 1:length(colorsA1L))
kMEtable1L = cbind(kMEtable1L, geneModuleMembership1L[,i], MMPvalue1L[,i])
colnames(kMEtable1L)=c("PSID","Gene","Module",sort(c(colnames(geneModuleMembership1L),
colnames(MMPvalue1L))))
write.csv(kMEtable1L,"kMEtable1labels.csv")
#Hub genes are genes that show significant correlation with MEs and high within-module connectivity
#kME values are used to compare data A1 to determine which genes have extremely high kME values in networks
#and are thus genes are hubs in both networks
#ref: modified from https://support.bioconductor.org/p/94701/
topGenesKMEL = NULL
for (c in 1:length(colorsA1L)){
kMErank1L = rank(-geneModuleMembership1L[,c])
#maxKMErank = rank(apply(cbind(kMErank1,kMErank2+.00001),1,max))
#topGenesKMEL = cbind(topGenesKMEL,GeneL[maxKMErankL<=10])
topGenesKMEL = cbind(topGenesKMEL,GeneL[order(kMErank1L, decreasing=FALSE)][1:10])
}
colnames(topGenesKMEL) = colorsA1L
topGenesKMEL
## HuAgeDis_00 HuAgeDis_01 HuAgeDis_02 HuAgeDis_03 HuAgeDis_04
## [1,] "ANXA4" "NRXN3" "PRKD3" "RAF1" "HSF1"
## [2,] "PAN3" "GOT1" "AFF1" "SAFB2" "PHF1"
## [3,] "C6orf70" "MDH2" "NFE2L2" "MAPKAPK2" "IRF2BP1"
## [4,] "KIAA1407" "NRN1" "PTTG1IP" "PHF19" "HECTD3"
## [5,] "RAPGEF6" "TOMM20" "DYNLT1" "SIPA1" "FAM89B"
## [6,] "ZKSCAN1" "NAP1L5" "MAPRE1" "AUP1" "ANKRD13B"
## [7,] "EEF1A1" "ATP6AP2" "TBC1D2B" "KLHL21" "ADRM1"
## [8,] "DNAJC10" "CMAS" "NPC2" "ZNF646" "ZNF574"
## [9,] "ADD3" "GPRASP1" "CD302" "RBM4" "FAM100A"
## [10,] "DHTKD1" "NKIRAS1" "LRP10" "C1orf144" "DEDD"
## HuAgeDis_05 HuAgeDis_06 HuAgeDis_07 HuAgeDis_08 HuAgeDis_09
## [1,] "LRP10" "ACTR3B" "TBXAS1" "SYP" "DCTN3"
## [2,] "MAPKAPK2" "ACOT7" "TYROBP" "SSBP4" "PARD6A"
## [3,] "METTL7A" "C10orf35" "PTPRC" "ATP6V0C" "SUMO3"
## [4,] "CD59" "SVOP" "ITGB2" "RUSC1" "CUEDC2"
## [5,] "SERPINB6" "CLTA" "MYO1F" "PARD6A" "RUVBL2"
## [6,] "INPPL1" "LARGE" "LST1" "GSK3A" "HRAS"
## [7,] "AFF1" "NFKBIE" "CYBA" "LPHN1" "ASNA1"
## [8,] "TBC1D2B" "C6orf154" "SLC7A7" "DLGAP4" "COX5A"
## [9,] "ZFP36L1" "TRIM9" "RNASET2" "CFL1" "MRPL46"
## [10,] "TBL1X" "TTC9B" "FCER1G" "CBX4" "C17orf81"
## HuAgeDis_10 HuAgeDis_11
## [1,] "P4HA2" "CFH"
## [2,] "CHORDC1" "ANXA2P1"
## [3,] "STIP1" "KIAA1199"
## [4,] "DEDD2" "ANXA2"
## [5,] "DNAJA1" "FRZB"
## [6,] "MKNK2" "DCN"
## [7,] "SPR" "FBLN5"
## [8,] "DNAJB1" "FMOD"
## [9,] "BAG3" "SVIL"
## [10,] "HSPH1" "BMP6"
write.csv(topGenesKMEL,"A1_HubGenestopGenesKMElabelsRankSorted.csv")
#These genes or probes represent the top 10 hub genes for modules in data A1 networks.
#Hub genes are genes that show significant correlation with MEs and high within-module connectivity
#kME values are used to compare data A1 to determine which genes have extremely high kME values in networks
#and are thus genes are hubs in both networks
topGenesKMEL = NULL
for (c in 1:length(colorsA1L)){
kMErank1L = rank(-geneModuleMembership1L[,c])
maxKMErankL = rank(apply(cbind(kMErank1L),1,max))
#maxKMErank = rank(apply(cbind(kMErank1,kMErank2+.00001),1,max))
topGenesKMEL = cbind(topGenesKMEL,GeneL[maxKMErankL<=10])
}
colnames(topGenesKMEL) = colorsA1L
topGenesKMEL
## HuAgeDis_00 HuAgeDis_01 HuAgeDis_02 HuAgeDis_03 HuAgeDis_04
## [1,] "EEF1A1" "NRN1" "CD302" "SIPA1" "IRF2BP1"
## [2,] "ADD3" "NAP1L5" "LRP10" "PHF19" "FAM100A"
## [3,] "KIAA1407" "NRXN3" "PTTG1IP" "MAPKAPK2" "PHF1"
## [4,] "RAPGEF6" "TOMM20" "NFE2L2" "KLHL21" "HECTD3"
## [5,] "PAN3" "GOT1" "NPC2" "SAFB2" "HSF1"
## [6,] "ANXA4" "GPRASP1" "PRKD3" "RBM4" "ANKRD13B"
## [7,] "ZKSCAN1" "NKIRAS1" "AFF1" "C1orf144" "ZNF574"
## [8,] "DHTKD1" "ATP6AP2" "MAPRE1" "ZNF646" "ADRM1"
## [9,] "C6orf70" "MDH2" "DYNLT1" "AUP1" "FAM89B"
## [10,] "DNAJC10" "CMAS" "TBC1D2B" "RAF1" "DEDD"
## HuAgeDis_05 HuAgeDis_06 HuAgeDis_07 HuAgeDis_08 HuAgeDis_09
## [1,] "ZFP36L1" "SVOP" "TYROBP" "LPHN1" "PARD6A"
## [2,] "CD59" "ACTR3B" "TBXAS1" "CBX4" "SUMO3"
## [3,] "METTL7A" "C6orf154" "FCER1G" "ATP6V0C" "COX5A"
## [4,] "LRP10" "TRIM9" "ITGB2" "DLGAP4" "HRAS"
## [5,] "MAPKAPK2" "TTC9B" "PTPRC" "SYP" "MRPL46"
## [6,] "AFF1" "NFKBIE" "SLC7A7" "CFL1" "RUVBL2"
## [7,] "INPPL1" "CLTA" "LST1" "SSBP4" "DCTN3"
## [8,] "TBL1X" "ACOT7" "MYO1F" "PARD6A" "C17orf81"
## [9,] "SERPINB6" "C10orf35" "RNASET2" "GSK3A" "ASNA1"
## [10,] "TBC1D2B" "LARGE" "CYBA" "RUSC1" "CUEDC2"
## HuAgeDis_10 HuAgeDis_11
## [1,] "BAG3" "FMOD"
## [2,] "DNAJB1" "KIAA1199"
## [3,] "MKNK2" "FRZB"
## [4,] "CHORDC1" "CFH"
## [5,] "P4HA2" "ANXA2"
## [6,] "HSPH1" "ANXA2P1"
## [7,] "DNAJA1" "DCN"
## [8,] "SPR" "BMP6"
## [9,] "DEDD2" "SVIL"
## [10,] "STIP1" "FBLN5"
write.csv(topGenesKMEL,"A1_HubGenestopGenesKMElabelsRankUnsorted.csv")
#These genes or probes represent the top 10 hub genes for modules in data A1 networks.
#Ref: WGCNA tutorial Steve Horvath Peter Langfelder 2011
#intramodular connectivity is mathematically the sum of module edge weights or "degree" of connectivity between a given node or gene and other genes or nodes within the module.
#For colors
IntraModularConnectivity1=intramodularConnectivity(adjacencyA1,modulesA1)
#We get the color labels for the genes
GeneModule1=kMEtable1[,c("PSID","Gene","Module")]
rownames(GeneModule1)=rownames(kMEtable1)
#merge the intramodular connectivity dataframe with module names by rownames that are gene symbols
IntraModularConnectivity2=merge(GeneModule1,IntraModularConnectivity1, by="row.names")
#drop the extra row.name column
IntraModularConnectivity3=IntraModularConnectivity2[,-c(1)]
DT::datatable(head(IntraModularConnectivity3))
#export kWithin and kTotal for all genes
write.csv(IntraModularConnectivity1,"kWithintable1colors.csv")
#Ref: WGCNA tutorial Steve Horvath Peter Langfelder 2011
#intramodular connectivity is mathematically the sum of module edge weights or "degree" of connectivity between a given node or gene and other genes or nodes within the module.
#For colors
IntraModularConnectivity1L=intramodularConnectivity(adjacencyA1,modulesA1L)
#We get the number labels for the genes
GeneModule1L=kMEtable1L[,c("PSID","Gene","Module")]
rownames(GeneModule1L)=rownames(kMEtable1L)
#merge the intramodular connectivity dataframe with module names by rownames that are gene symbols
IntraModularConnectivity2L=merge(GeneModule1L,IntraModularConnectivity1L, by="row.names")
#drop the extra row.name colums
IntraModularConnectivity3L=IntraModularConnectivity2L[,-c(1)]
DT::datatable(head(IntraModularConnectivity3L))
#export kWithin and kTotal for all genes
write.csv(IntraModularConnectivity1L,"kWithintable1labels.csv")
#Make a folder first called 'Module_GeneskMEHubs' change full path below as per your system
dir.create("./Module_GeneskMEHubs")
folder = "Module_GeneskMEHubs/"
for (c in colorsA1){
fn = paste(folder, c, ".txt",sep="");
write.geneList(Gene[modulesA1==c], fn)
};
write(Gene,paste(folder,"all.txt",sep=""))
#The folder should now have files titled "<color>.txt" that contain the gene names of every gene in that module,as well as a file titled "all.txt" that has every gene in your data set.
#make a summary file of all the genes in modules, similar to kME files but with only genes and module names as above
geneListsModuleSummary=cbind(Gene,modulesA1)
write.csv(geneListsModuleSummary,"geneListsModuleSummarycolors.csv")
#Make a folder first called 'Module_GeneskMEHubs' change full path below as per your system
folder = "Module_GeneskMEHubs/"
for (c in colorsA1L){
fn = paste(folder, c, ".txt",sep="");
write.geneList(GeneL[modulesA1L==c], fn)
};
write(GeneL,paste(folder,"allL.txt",sep=""))
#The folder should now have files titled "<color>.txt" that contain the gene names of every gene in that module,as well as a file titled "all.txt" that has every gene in your data set.
#make a summary file of all the genes in modules, similar to kME files but with only genes and module names as above
geneListsModuleSummaryL=cbind(GeneL,modulesA1L)
write.csv(geneListsModuleSummaryL,"geneListsModuleSummarylabels.csv")
# For data A1
# Define numbers of genes and samples
nGenesA1 = nrow(datExprA1g2)
nSamplesA1 = ncol(datExprA1g2)
# Recalculate MEs with color labels
MEs0A1= moduleEigengenes(t(datExprA1g2),modulesA1)$eigengenes
MEsA1= orderMEs(MEs0A1)
modTraitCorA1 = cor(MEsA1, metadataA1g_coded, use = "p")
modTraitPA1 = corPvalueStudent(modTraitCorA1, nSamplesA1)
textMatrixA1 = paste(signif(modTraitCorA1, 2), "\n(",
signif(modTraitPA1, 1), ")", sep = "")
dim(textMatrixA1) = dim(modTraitCorA1)
par(mar = c(6, 8.5, 3, 3))
# Display the correlation values within a heatmap plot
labeledHeatmap(Matrix = modTraitCorA1, xLabels = names(metadataA1g_coded),
yLabels = names(MEsA1), ySymbols = names(MEsA1),
colorLabels =FALSE,colors=greenWhiteRed(50),textMatrix=textMatrixA1,
setStdMargins = FALSE, cex.text = 0.5, zlim = c(-1,1),
main = paste("A1_Module-trait relationships colors"))
dev.print(pdf,"A1_relating modules to trait colors.pdf", width=5, height=5)
## quartz_off_screen
## 2
#This is for color module trait table
colnames(modTraitPA1) = paste("p.value.", colnames(modTraitCorA1), sep="");
out3<-cbind(Module=rownames(modTraitCorA1), modTraitCorA1, modTraitPA1)
dim(out3)
## [1] 12 11
write.table(out3, "A1_relating modules to trait colors.csv", sep=",",row.names=F)
# For data A1
# Define numbers of genes and samples
nGenesA1L = nrow(datExprA1g2)
nSamplesA1L = ncol(datExprA1g2)
# Recalculate MEs with color labels
MEs0A1L= moduleEigengenes(t(datExprA1g2),modulesA1L)$eigengenes
MEsA1L= orderMEs(MEs0A1L)
modTraitCorA1L = cor(MEsA1L, metadataA1g_coded, use = "p")
modTraitPA1L = corPvalueStudent(modTraitCorA1L, nSamplesA1L)
textMatrixA1L = paste(signif(modTraitCorA1L, 2), "\n(",
signif(modTraitPA1L, 1), ")", sep = "")
dim(textMatrixA1L) = dim(modTraitCorA1L)
par(mar = c(6, 8.5, 3, 3))
# Display the correlation values within a heatmap plot
labeledHeatmap(Matrix = modTraitCorA1L, xLabels = names(metadataA1g_coded),
yLabels = names(MEsA1L), ySymbols = names(MEsA1L),
colorLabels =FALSE,colors=greenWhiteRed(50),textMatrix=textMatrixA1L,
setStdMargins = FALSE, cex.text = 0.5, zlim = c(-1,1),
main = paste("A1_Module-trait relationships labels"))
dev.print(pdf,"A1_relating modules to trait labels.pdf", width=5, height=5)
## quartz_off_screen
## 2
#This is for label module trait table
colnames(modTraitPA1L) = paste("p.value.", colnames(modTraitCorA1L), sep="");
out3L<-cbind(Module=rownames(modTraitCorA1L), modTraitCorA1L, modTraitPA1L)
dim(out3L)
## [1] 12 11
write.table(out3L, "A1_relating modules to trait labels.csv", sep=",",row.names=F)
save.image(file="temp.RData")
rm(list=ls())
gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 8682105 463.7 26482191 1414.4 NA 26482191 1414.4
## Vcells 23411406 178.7 418296004 3191.4 102400 570842476 4355.2
#To reload uncomment code below
#load(file="temp.RData")
load(file="temp.RData")
library(Rcpp)
library(stringi)
library(biomaRt)
#The useMart() function can now be used to connect to a specified BioMart database, this must be a valid name given by listMarts(). In the next example we choose to query the Ensembl BioMart database.
listMarts()
## biomart version
## 1 ENSEMBL_MART_ENSEMBL Ensembl Genes 94
## 2 ENSEMBL_MART_MOUSE Mouse strains 94
## 3 ENSEMBL_MART_SNP Ensembl Variation 94
## 4 ENSEMBL_MART_FUNCGEN Ensembl Regulation 94
ensembl=useMart("ensembl")
listDatasets(ensembl)
## dataset
## 1 acalliptera_gene_ensembl
## 2 acarolinensis_gene_ensembl
## 3 acitrinellus_gene_ensembl
## 4 amelanoleuca_gene_ensembl
## 5 amexicanus_gene_ensembl
## 6 anancymaae_gene_ensembl
## 7 aocellaris_gene_ensembl
## 8 apercula_gene_ensembl
## 9 aplatyrhynchos_gene_ensembl
## 10 apolyacanthus_gene_ensembl
## 11 atestudineus_gene_ensembl
## 12 btaurus_gene_ensembl
## 13 caperea_gene_ensembl
## 14 catys_gene_ensembl
## 15 ccapucinus_gene_ensembl
## 16 cchok1gshd_gene_ensembl
## 17 ccrigri_gene_ensembl
## 18 celegans_gene_ensembl
## 19 cfamiliaris_gene_ensembl
## 20 chircus_gene_ensembl
## 21 choffmanni_gene_ensembl
## 22 cintestinalis_gene_ensembl
## 23 cjacchus_gene_ensembl
## 24 clanigera_gene_ensembl
## 25 cpalliatus_gene_ensembl
## 26 cporcellus_gene_ensembl
## 27 csabaeus_gene_ensembl
## 28 csavignyi_gene_ensembl
## 29 csemilaevis_gene_ensembl
## 30 csyrichta_gene_ensembl
## 31 cvariegatus_gene_ensembl
## 32 dmelanogaster_gene_ensembl
## 33 dnovemcinctus_gene_ensembl
## 34 dordii_gene_ensembl
## 35 drerio_gene_ensembl
## 36 eburgeri_gene_ensembl
## 37 ecaballus_gene_ensembl
## 38 eeuropaeus_gene_ensembl
## 39 elucius_gene_ensembl
## 40 etelfairi_gene_ensembl
## 41 falbicollis_gene_ensembl
## 42 fcatus_gene_ensembl
## 43 fdamarensis_gene_ensembl
## 44 fheteroclitus_gene_ensembl
## 45 gaculeatus_gene_ensembl
## 46 gaffinis_gene_ensembl
## 47 ggallus_gene_ensembl
## 48 ggorilla_gene_ensembl
## 49 gmorhua_gene_ensembl
## 50 hburtoni_gene_ensembl
## 51 hcomes_gene_ensembl
## 52 hfemale_gene_ensembl
## 53 hmale_gene_ensembl
## 54 hsapiens_gene_ensembl
## 55 ipunctatus_gene_ensembl
## 56 itridecemlineatus_gene_ensembl
## 57 jjaculus_gene_ensembl
## 58 kmarmoratus_gene_ensembl
## 59 lafricana_gene_ensembl
## 60 lbergylta_gene_ensembl
## 61 lchalumnae_gene_ensembl
## 62 loculatus_gene_ensembl
## 63 malbus_gene_ensembl
## 64 marmatus_gene_ensembl
## 65 mauratus_gene_ensembl
## 66 mcaroli_gene_ensembl
## 67 mdomestica_gene_ensembl
## 68 mfascicularis_gene_ensembl
## 69 mfuro_gene_ensembl
## 70 mgallopavo_gene_ensembl
## 71 mleucophaeus_gene_ensembl
## 72 mlucifugus_gene_ensembl
## 73 mmola_gene_ensembl
## 74 mmulatta_gene_ensembl
## 75 mmurinus_gene_ensembl
## 76 mmusculus_gene_ensembl
## 77 mnemestrina_gene_ensembl
## 78 mochrogaster_gene_ensembl
## 79 mpahari_gene_ensembl
## 80 mspretus_gene_ensembl
## 81 mzebra_gene_ensembl
## 82 nbrichardi_gene_ensembl
## 83 neugenii_gene_ensembl
## 84 ngalili_gene_ensembl
## 85 nleucogenys_gene_ensembl
## 86 oanatinus_gene_ensembl
## 87 oaries_gene_ensembl
## 88 ocuniculus_gene_ensembl
## 89 odegus_gene_ensembl
## 90 ogarnettii_gene_ensembl
## 91 ohni_gene_ensembl
## 92 ohsok_gene_ensembl
## 93 olatipes_gene_ensembl
## 94 omelastigma_gene_ensembl
## 95 oniloticus_gene_ensembl
## 96 oprinceps_gene_ensembl
## 97 pabelii_gene_ensembl
## 98 paltaica_gene_ensembl
## 99 panubis_gene_ensembl
## 100 pbairdii_gene_ensembl
## 101 pcapensis_gene_ensembl
## 102 pcoquereli_gene_ensembl
## 103 pformosa_gene_ensembl
## 104 pkingsleyae_gene_ensembl
## 105 platipinna_gene_ensembl
## 106 pmagnuspinnatus_gene_ensembl
## 107 pmarinus_gene_ensembl
## 108 pmexicana_gene_ensembl
## 109 pnattereri_gene_ensembl
## 110 pnyererei_gene_ensembl
## 111 ppaniscus_gene_ensembl
## 112 ppardus_gene_ensembl
## 113 preticulata_gene_ensembl
## 114 psinensis_gene_ensembl
## 115 ptroglodytes_gene_ensembl
## 116 pvampyrus_gene_ensembl
## 117 rbieti_gene_ensembl
## 118 rnorvegicus_gene_ensembl
## 119 rroxellana_gene_ensembl
## 120 saraneus_gene_ensembl
## 121 sboliviensis_gene_ensembl
## 122 scerevisiae_gene_ensembl
## 123 sdorsalis_gene_ensembl
## 124 sdumerili_gene_ensembl
## 125 sformosus_gene_ensembl
## 126 sharrisii_gene_ensembl
## 127 smaximus_gene_ensembl
## 128 spartitus_gene_ensembl
## 129 sscrofa_gene_ensembl
## 130 tbelangeri_gene_ensembl
## 131 tguttata_gene_ensembl
## 132 tnigroviridis_gene_ensembl
## 133 trubripes_gene_ensembl
## 134 ttruncatus_gene_ensembl
## 135 vpacos_gene_ensembl
## 136 xcouchianus_gene_ensembl
## 137 xmaculatus_gene_ensembl
## 138 xtropicalis_gene_ensembl
## description
## 1 Eastern happy genes (fAstCal1.2)
## 2 Anole lizard genes (AnoCar2.0)
## 3 Midas cichlid genes (Midas_v5)
## 4 Panda genes (ailMel1)
## 5 Cave fish genes (Astyanax_mexicanus-2.0)
## 6 Ma's night monkey genes (Anan_2.0)
## 7 Clown anemonefish genes (AmpOce1.0)
## 8 Orange clownfish genes (Nemo_v1)
## 9 Duck genes (BGI_duck_1.0)
## 10 Spiny chromis genes (ASM210954v1)
## 11 Climbing perch genes (fAnaTes1.1)
## 12 Cow genes (UMD3.1)
## 13 Brazilian guinea pig genes (CavAp1.0)
## 14 Sooty mangabey genes (Caty_1.0)
## 15 Capuchin genes (Cebus_imitator-1.0)
## 16 Chinese hamster CHOK1GS genes (CHOK1GS_HDv1)
## 17 Chinese hamster CriGri genes (CriGri_1.0)
## 18 Caenorhabditis elegans genes (WBcel235)
## 19 Dog genes (CanFam3.1)
## 20 Goat genes (ARS1)
## 21 Sloth genes (choHof1)
## 22 C.intestinalis genes (KH)
## 23 Marmoset genes (ASM275486v1)
## 24 Long-tailed chinchilla genes (ChiLan1.0)
## 25 Angola colobus genes (Cang.pa_1.0)
## 26 Guinea Pig genes (Cavpor3.0)
## 27 Vervet-AGM genes (ChlSab1.1)
## 28 C.savignyi genes (CSAV 2.0)
## 29 Tongue sole genes (Cse_v1.0)
## 30 Tarsier genes (Tarsius_syrichta-2.0.1)
## 31 Sheepshead minnow genes (C_variegatus-1.0)
## 32 Fruitfly genes (BDGP6)
## 33 Armadillo genes (Dasnov3.0)
## 34 Kangaroo rat genes (Dord_2.0)
## 35 Zebrafish genes (GRCz11)
## 36 Hagfish genes (Eburgeri_3.2)
## 37 Horse genes (Equ Cab 2)
## 38 Hedgehog genes (eriEur1)
## 39 Northern pike genes (Eluc_V3)
## 40 Lesser hedgehog tenrec genes (TENREC)
## 41 Flycatcher genes (FicAlb_1.4)
## 42 Cat genes (Felis_catus_9.0)
## 43 Damara mole rat genes (DMR_v1.0)
## 44 Mummichog genes (Fundulus_heteroclitus-3.0.2)
## 45 Stickleback genes (BROAD S1)
## 46 Western mosquitofish genes (ASM309773v1)
## 47 Chicken genes (Gallus_gallus-5.0)
## 48 Gorilla genes (gorGor4)
## 49 Cod genes (gadMor1)
## 50 Burton's mouthbrooder genes (AstBur1.0)
## 51 Tiger tail seahorse genes (H_comes_QL1_v1)
## 52 Naked mole-rat female genes (HetGla_female_1.0)
## 53 Naked mole-rat male genes (HetGla_1.0)
## 54 Human genes (GRCh38.p12)
## 55 Channel catfish genes (IpCoco_1.2)
## 56 Squirrel genes (SpeTri2.0)
## 57 Lesser Egyptian jerboa genes (JacJac1.0)
## 58 Mangrove rivulus genes (ASM164957v1)
## 59 Elephant genes (Loxafr3.0)
## 60 Ballan wrasse genes (BallGen_V1)
## 61 Coelacanth genes (LatCha1)
## 62 Spotted gar genes (LepOcu1)
## 63 Swamp eel genes (M_albus_1.0)
## 64 Zig-zag eel genes (fMasArm1.1)
## 65 Golden Hamster genes (MesAur1.0)
## 66 Ryukyu mouse genes (CAROLI_EIJ_v1.1)
## 67 Opossum genes (monDom5)
## 68 Crab-eating macaque genes (Macaca_fascicularis_5.0)
## 69 Ferret genes (MusPutFur1.0)
## 70 Turkey genes (Turkey_2.01)
## 71 Drill genes (Mleu.le_1.0)
## 72 Microbat genes (Myoluc2.0)
## 73 Ocean sunfish genes (ASM169857v1)
## 74 Macaque genes (Mmul_8.0.1)
## 75 Mouse Lemur genes (Mmur_3.0)
## 76 Mouse genes (GRCm38.p6)
## 77 Pig-tailed macaque genes (Mnem_1.0)
## 78 Prairie vole genes (MicOch1.0)
## 79 Shrew mouse genes (PAHARI_EIJ_v1.1)
## 80 Algerian mouse genes (SPRET_EiJ_v1)
## 81 Zebra mbuna genes (M_zebra_UMD2a)
## 82 Lyretail cichlid genes (NeoBri1.0)
## 83 Wallaby genes (Meug_1.0)
## 84 Upper Galilee mountains blind mole rat genes (S.galili_v1.0)
## 85 Gibbon genes (Nleu_3.0)
## 86 Platypus genes (OANA5)
## 87 Sheep genes (Oar_v3.1)
## 88 Rabbit genes (OryCun2.0)
## 89 Degu genes (OctDeg1.0)
## 90 Bushbaby genes (OtoGar3)
## 91 Japanese Medaka HNI genes (ASM223471v1)
## 92 Japanese Medaka HSOK genes (ASM223469v1)
## 93 Japanese Medaka HdrR genes (ASM223467v1)
## 94 Indian medaka genes (Om_v0.7.RACA)
## 95 Tilapia genes (Orenil1.0)
## 96 Pika genes (OchPri2.0-Ens)
## 97 Orangutan genes (PPYG2)
## 98 Tiger genes (PanTig1.0)
## 99 Olive baboon genes (Panu_3.0)
## 100 Northern American deer mouse genes (Pman_1.0)
## 101 Hyrax genes (proCap1)
## 102 Coquerel's sifaka genes (Pcoq_1.0)
## 103 Amazon molly genes (Poecilia_formosa-5.1.2)
## 104 Paramormyrops kingsleyae genes (PKINGS_0.1)
## 105 Sailfin molly genes (P_latipinna-1.0)
## 106 Big-finned mudskipper genes (PM.fa)
## 107 Lamprey genes (Pmarinus_7.0)
## 108 Shortfin molly genes (P_mexicana-1.0)
## 109 Red-bellied piranha genes (Pygocentrus_nattereri-1.0.2)
## 110 Pundamilia nyererei genes (PunNye1.0)
## 111 Bonobo genes (panpan1.1)
## 112 Leopard genes (PanPar1.0)
## 113 Guppy genes (Guppy_female_1.0_MT)
## 114 Chinese softshell turtle genes (PelSin_1.0)
## 115 Chimpanzee genes (Pan_tro_3.0)
## 116 Megabat genes (pteVam1)
## 117 Black snub-nosed monkey genes (ASM169854v1)
## 118 Rat genes (Rnor_6.0)
## 119 Golden snub-nosed monkey genes (Rrox_v1)
## 120 Shrew genes (sorAra1)
## 121 Bolivian squirrel monkey genes (SaiBol1.0)
## 122 Saccharomyces cerevisiae genes (R64-1-1)
## 123 Yellowtail amberjack genes (Sedor1)
## 124 Greater amberjack genes (Sdu_1.0)
## 125 Asian bonytongue genes (ASM162426v1)
## 126 Tasmanian devil genes (Devil_ref v7.0)
## 127 Turbot genes (ASM318616v1)
## 128 Bicolor damselfish genes (Stegastes_partitus-1.0.2)
## 129 Pig genes (Sscrofa11.1)
## 130 Tree Shrew genes (tupBel1)
## 131 Zebra Finch genes (taeGut3.2.4)
## 132 Tetraodon genes (TETRAODON 8.0)
## 133 Fugu genes (FUGU5)
## 134 Dolphin genes (turTru1)
## 135 Alpaca genes (vicPac1)
## 136 Monterrey platyfish genes (Xiphophorus_couchianus-4.0.1)
## 137 Platyfish genes (X_maculatus-5.0-male)
## 138 Xenopus genes (JGI 4.2)
## version
## 1 fAstCal1.2
## 2 AnoCar2.0
## 3 Midas_v5
## 4 ailMel1
## 5 Astyanax_mexicanus-2.0
## 6 Anan_2.0
## 7 AmpOce1.0
## 8 Nemo_v1
## 9 BGI_duck_1.0
## 10 ASM210954v1
## 11 fAnaTes1.1
## 12 UMD3.1
## 13 CavAp1.0
## 14 Caty_1.0
## 15 Cebus_imitator-1.0
## 16 CHOK1GS_HDv1
## 17 CriGri_1.0
## 18 WBcel235
## 19 CanFam3.1
## 20 ARS1
## 21 choHof1
## 22 KH
## 23 ASM275486v1
## 24 ChiLan1.0
## 25 Cang.pa_1.0
## 26 Cavpor3.0
## 27 ChlSab1.1
## 28 CSAV 2.0
## 29 Cse_v1.0
## 30 Tarsius_syrichta-2.0.1
## 31 C_variegatus-1.0
## 32 BDGP6
## 33 Dasnov3.0
## 34 Dord_2.0
## 35 GRCz11
## 36 Eburgeri_3.2
## 37 Equ Cab 2
## 38 eriEur1
## 39 Eluc_V3
## 40 TENREC
## 41 FicAlb_1.4
## 42 Felis_catus_9.0
## 43 DMR_v1.0
## 44 Fundulus_heteroclitus-3.0.2
## 45 BROAD S1
## 46 ASM309773v1
## 47 Gallus_gallus-5.0
## 48 gorGor4
## 49 gadMor1
## 50 AstBur1.0
## 51 H_comes_QL1_v1
## 52 HetGla_female_1.0
## 53 HetGla_1.0
## 54 GRCh38.p12
## 55 IpCoco_1.2
## 56 SpeTri2.0
## 57 JacJac1.0
## 58 ASM164957v1
## 59 Loxafr3.0
## 60 BallGen_V1
## 61 LatCha1
## 62 LepOcu1
## 63 M_albus_1.0
## 64 fMasArm1.1
## 65 MesAur1.0
## 66 CAROLI_EIJ_v1.1
## 67 monDom5
## 68 Macaca_fascicularis_5.0
## 69 MusPutFur1.0
## 70 Turkey_2.01
## 71 Mleu.le_1.0
## 72 Myoluc2.0
## 73 ASM169857v1
## 74 Mmul_8.0.1
## 75 Mmur_3.0
## 76 GRCm38.p6
## 77 Mnem_1.0
## 78 MicOch1.0
## 79 PAHARI_EIJ_v1.1
## 80 SPRET_EiJ_v1
## 81 M_zebra_UMD2a
## 82 NeoBri1.0
## 83 Meug_1.0
## 84 S.galili_v1.0
## 85 Nleu_3.0
## 86 OANA5
## 87 Oar_v3.1
## 88 OryCun2.0
## 89 OctDeg1.0
## 90 OtoGar3
## 91 ASM223471v1
## 92 ASM223469v1
## 93 ASM223467v1
## 94 Om_v0.7.RACA
## 95 Orenil1.0
## 96 OchPri2.0-Ens
## 97 PPYG2
## 98 PanTig1.0
## 99 Panu_3.0
## 100 Pman_1.0
## 101 proCap1
## 102 Pcoq_1.0
## 103 Poecilia_formosa-5.1.2
## 104 PKINGS_0.1
## 105 P_latipinna-1.0
## 106 PM.fa
## 107 Pmarinus_7.0
## 108 P_mexicana-1.0
## 109 Pygocentrus_nattereri-1.0.2
## 110 PunNye1.0
## 111 panpan1.1
## 112 PanPar1.0
## 113 Guppy_female_1.0_MT
## 114 PelSin_1.0
## 115 Pan_tro_3.0
## 116 pteVam1
## 117 ASM169854v1
## 118 Rnor_6.0
## 119 Rrox_v1
## 120 sorAra1
## 121 SaiBol1.0
## 122 R64-1-1
## 123 Sedor1
## 124 Sdu_1.0
## 125 ASM162426v1
## 126 Devil_ref v7.0
## 127 ASM318616v1
## 128 Stegastes_partitus-1.0.2
## 129 Sscrofa11.1
## 130 tupBel1
## 131 taeGut3.2.4
## 132 TETRAODON 8.0
## 133 FUGU5
## 134 turTru1
## 135 vicPac1
## 136 Xiphophorus_couchianus-4.0.1
## 137 X_maculatus-5.0-male
## 138 JGI 4.2
CellType=read.csv('./InputHuAgeDis/CellType_GSE52564_FCover20_Signature.csv', header=T, sep=',')
CellType=CellType[!duplicated(CellType$Gene_mouse),]
rownames(CellType)=CellType$Gene_mouse
mouse = useMart('ENSEMBL_MART_ENSEMBL', dataset = 'mmusculus_gene_ensembl')
Human = useMart('ENSEMBL_MART_ENSEMBL', dataset = 'hsapiens_gene_ensembl')
myMap = getLDS(attributes = "mgi_symbol", filters = 'mgi_symbol',
values = CellType$Gene_mouse, mart = mouse,
attributesL = c("hgnc_symbol"), martL = Human)
#Some genes will be duplicates (HLA), so let's make them unique.
myMap=myMap[!duplicated(myMap$HGNC.symbol), ]
#unique(myMap$HGNC.symbol)
myMap_new<-myMap
names(myMap_new) <- c("Gene_mouse", "HuGene")
DT::datatable(head(myMap_new))
CellType_Hu=merge(CellType,myMap_new,by='Gene_mouse')
colnames(CellType_Hu)[colnames(CellType_Hu)=="Gene_mouse"] <- "MsGene"
CellType_Hu=CellType_Hu[c("HuGene","CellType_Vs_Others","MsGene")]
CellType_Hu=CellType_Hu[!duplicated(CellType_Hu$HuGene), ]
rownames(CellType_Hu)=CellType_Hu$HuGene
CellType_Hu=CellType_Hu[,-1]
write.csv(CellType_Hu,"CellType_GSE52564_FCover20_Signature_Hu.csv")
celltype=read.csv("CellType_GSE52564_FCover20_Signature_Hu.csv",sep=',')
cellTypeGeneCount=as.data.frame(table(celltype$CellType_Vs_Others))
cellTypeGeneCount
## Var1 Freq
## 1 AstroByothersCuffdiff2FCover20 452
## 2 EndotheByothersCuffdiff2FCover20 621
## 3 MicrogliByothersCuffdiff2FCover20 1149
## 4 MyeOligoByothersCuffdiff2FCover20 263
## 5 NeuronByothersCuffdiff2FCover20 709
## 6 NewOligoByothersCuffdiff2FCover20 82
## 7 PreOligoByothersCuffdiff2FCover20 116
write.csv(cellTypeGeneCount,"CellType_GSE52564_FCover20_Signature_Hu_GeneCount.csv")
#This is the actual calulation of hypergeometric enrichment
enrichmentsCellType = userListEnrichment(rownames(datExprA1g2), modulesA1,"CellType_GSE52564_FCover20_Signature_Hu.csv", "", "enrichmentsCellType_Sigificantcolors.csv")
## 77 comparisons were successfully performed.
#List of genes that overlap between modules and cell type
enrichmentsCellType_OvGenes=enrichmentsCellType[[2]]
dir.create("./enrichmentsCellType_OvGenes")
for (i in 1:length(enrichmentsCellType_OvGenes)) {
write.csv(enrichmentsCellType_OvGenes[i], file=paste0("enrichmentsCellType_OvGenes/", names(enrichmentsCellType_OvGenes)[i], ".txt"))
}
#List of genes that overlap between modules and cell type
enrichmentsCellType_NumOvGenes_pvalue=enrichmentsCellType[[1]]
write.csv(enrichmentsCellType_NumOvGenes_pvalue,"enrichments_NumOvGenes_pvalue_colors.csv")
geneListsModuleSummary=read.csv('geneListsModuleSummarycolors.csv', sep=',')
enrichmentsSummary=read.csv('enrichments_NumOvGenes_pvalue_colors.csv', sep=',')
geneListsModuleSummaryMod=table(geneListsModuleSummary$modulesA1)
geneListsModuleSummaryMod=as.data.frame(geneListsModuleSummaryMod)
write.csv(geneListsModuleSummaryMod, "geneListsModuleSummarycolors_GeneCount.csv")
colnames(geneListsModuleSummaryMod)=c("InputCategories", "Freq")
geneListsModuleSummaryMod
## InputCategories Freq
## 1 black 113
## 2 blue 470
## 3 brown 413
## 4 green 239
## 5 greenyellow 49
## 6 grey 3877
## 7 magenta 69
## 8 pink 82
## 9 purple 58
## 10 red 175
## 11 turquoise 1501
## 12 yellow 255
enrichmentsSummarymergeMod=merge(enrichmentsSummary,geneListsModuleSummaryMod, by="InputCategories")
write.csv(enrichmentsSummarymergeMod,"enrichments_SummarymergeMod_colors.csv")
enrichmentsCellTypeL = userListEnrichment(rownames(datExprA1g2), modulesA1L,"CellType_GSE52564_FCover20_Signature_Hu.csv", "", "enrichmentsCellType_Sigificantlabels.csv")
## 84 comparisons were successfully performed.
enrichmentsCellType_OvGenesL=enrichmentsCellTypeL[[2]]
for (i in 1:length(enrichmentsCellType_OvGenesL)) {
write.csv(enrichmentsCellType_OvGenesL[i], file=paste0("enrichmentsCellType_OvGenes/", names(enrichmentsCellType_OvGenesL)[i], ".txt"))
}
enrichmentsCellType_NumOvGenes_pvalueL=enrichmentsCellTypeL[[1]]
write.csv(enrichmentsCellType_NumOvGenes_pvalueL,"enrichments_NumOvGenes_pvalue_labels.csv")
geneListsModuleSummaryL=read.csv('geneListsModuleSummarylabels.csv', sep=',')
enrichmentsSummaryL=read.csv('enrichments_NumOvGenes_pvalue_labels.csv', sep=',')
geneListsModuleSummaryModL=table(geneListsModuleSummaryL$modulesA1L)
geneListsModuleSummaryModL=as.data.frame(geneListsModuleSummaryModL)
write.csv(geneListsModuleSummaryModL, "geneListsModuleSummarylabels_GeneCount.csv")
colnames(geneListsModuleSummaryModL)=c("InputCategories", "Freq")
enrichmentsSummarymergeModL=merge(enrichmentsSummaryL,geneListsModuleSummaryModL, by="InputCategories")
write.csv(enrichmentsSummarymergeModL,"enrichments_SummarymergeMod_labels.csv")
#Export expression file
write.csv(datExprA1g2,"ForPres&Hyper_HuAgeDis_datExprA1g2.csv")
#Export the module and gene names for use Preservation and hypergeometric test
write.csv(kMEtable1[,c("PSID","Gene","Module")],"ForPres&Hyper_HuAgeDis_GeneModule.csv")
write.csv(kMEtable1L[,c("PSID","Gene","Module")],"ForPres&Hyper_HuAgeDis_GeneModuleL.csv")
below select black module which has microglia genes
dbs <- listEnrichrDbs()
dbs
## libraryName numTerms
## 1 Genome_Browser_PWMs 615
## 2 TRANSFAC_and_JASPAR_PWMs 326
## 3 Transcription_Factor_PPIs 290
## 4 ChEA_2013 353
## 5 Drug_Perturbations_from_GEO_2014 701
## 6 ENCODE_TF_ChIP-seq_2014 498
## 7 BioCarta_2013 249
## 8 Reactome_2013 78
## 9 WikiPathways_2013 199
## 10 Disease_Signatures_from_GEO_up_2014 142
## 11 KEGG_2013 200
## 12 TF-LOF_Expression_from_GEO 269
## 13 TargetScan_microRNA 222
## 14 PPI_Hub_Proteins 385
## 15 GO_Molecular_Function_2015 1136
## 16 GeneSigDB 2139
## 17 Chromosome_Location 386
## 18 Human_Gene_Atlas 84
## 19 Mouse_Gene_Atlas 96
## 20 GO_Cellular_Component_2015 641
## 21 GO_Biological_Process_2015 5192
## 22 Human_Phenotype_Ontology 1779
## 23 Epigenomics_Roadmap_HM_ChIP-seq 383
## 24 KEA_2013 474
## 25 NURSA_Human_Endogenous_Complexome 1796
## 26 CORUM 1658
## 27 SILAC_Phosphoproteomics 84
## 28 MGI_Mammalian_Phenotype_Level_3 71
## 29 MGI_Mammalian_Phenotype_Level_4 476
## 30 Old_CMAP_up 6100
## 31 Old_CMAP_down 6100
## 32 OMIM_Disease 90
## 33 OMIM_Expanded 187
## 34 VirusMINT 85
## 35 MSigDB_Computational 858
## 36 MSigDB_Oncogenic_Signatures 189
## 37 Disease_Signatures_from_GEO_down_2014 142
## 38 Virus_Perturbations_from_GEO_up 323
## 39 Virus_Perturbations_from_GEO_down 323
## 40 Cancer_Cell_Line_Encyclopedia 967
## 41 NCI-60_Cancer_Cell_Lines 93
## 42 Tissue_Protein_Expression_from_ProteomicsDB 207
## 43 Tissue_Protein_Expression_from_Human_Proteome_Map 30
## 44 HMDB_Metabolites 3906
## 45 Pfam_InterPro_Domains 311
## 46 GO_Biological_Process_2013 941
## 47 GO_Cellular_Component_2013 205
## 48 GO_Molecular_Function_2013 402
## 49 Allen_Brain_Atlas_up 2192
## 50 ENCODE_TF_ChIP-seq_2015 816
## 51 ENCODE_Histone_Modifications_2015 412
## 52 Phosphatase_Substrates_from_DEPOD 59
## 53 Allen_Brain_Atlas_down 2192
## 54 ENCODE_Histone_Modifications_2013 109
## 55 Achilles_fitness_increase 216
## 56 Achilles_fitness_decrease 216
## 57 MGI_Mammalian_Phenotype_2013 476
## 58 BioCarta_2015 239
## 59 HumanCyc_2015 125
## 60 KEGG_2015 179
## 61 NCI-Nature_2015 209
## 62 Panther_2015 104
## 63 WikiPathways_2015 404
## 64 Reactome_2015 1389
## 65 ESCAPE 315
## 66 HomoloGene 12
## 67 Disease_Perturbations_from_GEO_down 839
## 68 Disease_Perturbations_from_GEO_up 839
## 69 Drug_Perturbations_from_GEO_down 906
## 70 Genes_Associated_with_NIH_Grants 32876
## 71 Drug_Perturbations_from_GEO_up 906
## 72 KEA_2015 428
## 73 Single_Gene_Perturbations_from_GEO_up 2460
## 74 Single_Gene_Perturbations_from_GEO_down 2460
## 75 ChEA_2015 395
## 76 dbGaP 345
## 77 LINCS_L1000_Chem_Pert_up 33132
## 78 LINCS_L1000_Chem_Pert_down 33132
## 79 GTEx_Tissue_Sample_Gene_Expression_Profiles_down 2918
## 80 GTEx_Tissue_Sample_Gene_Expression_Profiles_up 2918
## 81 Ligand_Perturbations_from_GEO_down 261
## 82 Aging_Perturbations_from_GEO_down 286
## 83 Aging_Perturbations_from_GEO_up 286
## 84 Ligand_Perturbations_from_GEO_up 261
## 85 MCF7_Perturbations_from_GEO_down 401
## 86 MCF7_Perturbations_from_GEO_up 401
## 87 Microbe_Perturbations_from_GEO_down 312
## 88 Microbe_Perturbations_from_GEO_up 312
## 89 LINCS_L1000_Ligand_Perturbations_down 96
## 90 LINCS_L1000_Ligand_Perturbations_up 96
## 91 LINCS_L1000_Kinase_Perturbations_down 3644
## 92 LINCS_L1000_Kinase_Perturbations_up 3644
## 93 Reactome_2016 1530
## 94 KEGG_2016 293
## 95 WikiPathways_2016 437
## 96 ENCODE_and_ChEA_Consensus_TFs_from_ChIP-X 104
## 97 Kinase_Perturbations_from_GEO_down 285
## 98 Kinase_Perturbations_from_GEO_up 285
## 99 BioCarta_2016 237
## 100 HumanCyc_2016 152
## 101 NCI-Nature_2016 209
## 102 Panther_2016 112
## 103 DrugMatrix 7876
## 104 ChEA_2016 645
## 105 huMAP 995
## 106 Jensen_TISSUES 1842
## 107 RNA-Seq_Disease_Gene_and_Drug_Signatures_from_GEO 1302
## 108 MGI_Mammalian_Phenotype_2017 5231
## 109 Jensen_COMPARTMENTS 2283
## 110 Jensen_DISEASES 1811
## 111 BioPlex_2017 3915
## 112 GO_Cellular_Component_2017 636
## 113 GO_Molecular_Function_2017 972
## 114 GO_Biological_Process_2017 3166
## 115 GO_Cellular_Component_2017b 816
## 116 GO_Molecular_Function_2017b 3271
## 117 GO_Biological_Process_2017b 10125
## 118 ARCHS4_Tissues 108
## 119 ARCHS4_Cell-lines 125
## 120 ARCHS4_IDG_Coexp 352
## 121 ARCHS4_Kinases_Coexp 498
## 122 ARCHS4_TFs_Coexp 1724
## 123 SysMyo_Muscle_Gene_Sets 1135
## 124 miRTarBase_2017 3240
## 125 TargetScan_microRNA_2017 683
## 126 Enrichr_Libraries_Most_Popular_Genes 121
## 127 Enrichr_Submissions_TF-Gene_Coocurrence 1722
## 128 Data_Acquisition_Method_Most_Popular_Genes 12
## 129 DSigDB 4026
## 130 GO_Biological_Process_2018 5103
## 131 GO_Cellular_Component_2018 446
## 132 GO_Molecular_Function_2018 1151
## 133 TF_Perturbations_Followed_by_Expression 1958
## geneCoverage genesPerTerm
## 1 13362 275
## 2 27884 1284
## 3 6002 77
## 4 47172 1370
## 5 47107 509
## 6 21493 3713
## 7 1295 18
## 8 3185 73
## 9 2854 34
## 10 15057 300
## 11 4128 48
## 12 34061 641
## 13 7504 155
## 14 16399 247
## 15 12753 57
## 16 23726 127
## 17 32740 85
## 18 13373 258
## 19 19270 388
## 20 13236 82
## 21 14264 58
## 22 3096 31
## 23 22288 4368
## 24 4533 37
## 25 10231 158
## 26 2741 5
## 27 5655 342
## 28 10406 715
## 29 10493 200
## 30 11251 100
## 31 8695 100
## 32 1759 25
## 33 2178 89
## 34 851 15
## 35 10061 106
## 36 11250 166
## 37 15406 300
## 38 17711 300
## 39 17576 300
## 40 15797 176
## 41 12232 343
## 42 13572 301
## 43 6454 301
## 44 3723 47
## 45 7588 35
## 46 7682 78
## 47 7324 172
## 48 8469 122
## 49 13121 305
## 50 26382 1811
## 51 29065 2123
## 52 280 9
## 53 13877 304
## 54 15852 912
## 55 4320 129
## 56 4271 128
## 57 10496 201
## 58 1678 21
## 59 756 12
## 60 3800 48
## 61 2541 39
## 62 1918 39
## 63 5863 51
## 64 6768 47
## 65 25651 807
## 66 19129 1594
## 67 23939 293
## 68 23561 307
## 69 23877 302
## 70 15886 9
## 71 24350 299
## 72 3102 25
## 73 31132 298
## 74 30832 302
## 75 48230 1429
## 76 5613 36
## 77 9559 73
## 78 9448 63
## 79 16725 1443
## 80 19249 1443
## 81 15090 282
## 82 16129 292
## 83 15309 308
## 84 15103 318
## 85 15022 290
## 86 15676 310
## 87 15854 279
## 88 15015 321
## 89 3788 159
## 90 3357 153
## 91 12668 300
## 92 12638 300
## 93 8973 64
## 94 7010 87
## 95 5966 51
## 96 15562 887
## 97 17850 300
## 98 17660 300
## 99 1348 19
## 100 934 13
## 101 2541 39
## 102 2041 42
## 103 5209 300
## 104 49238 1550
## 105 2243 19
## 106 19586 545
## 107 22440 505
## 108 8184 24
## 109 18329 161
## 110 15755 28
## 111 10271 22
## 112 10427 38
## 113 10601 25
## 114 13822 21
## 115 8002 143
## 116 10089 45
## 117 13247 49
## 118 21809 2316
## 119 23601 2395
## 120 20883 299
## 121 19612 299
## 122 25983 299
## 123 19500 137
## 124 14893 128
## 125 17598 1208
## 126 5902 109
## 127 12486 299
## 128 1073 100
## 129 19513 117
## 130 14433 36
## 131 8655 61
## 132 11459 39
## 133 19741 270
## link
## 1 http://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/
## 2 http://jaspar.genereg.net/html/DOWNLOAD/
## 3
## 4 http://amp.pharm.mssm.edu/lib/cheadownload.jsp
## 5 http://www.ncbi.nlm.nih.gov/geo/
## 6 http://genome.ucsc.edu/ENCODE/downloads.html
## 7 https://cgap.nci.nih.gov/Pathways/BioCarta_Pathways
## 8 http://www.reactome.org/download/index.html
## 9 http://www.wikipathways.org/index.php/Download_Pathways
## 10 http://www.ncbi.nlm.nih.gov/geo/
## 11 http://www.kegg.jp/kegg/download/
## 12 http://www.ncbi.nlm.nih.gov/geo/
## 13 http://www.targetscan.org/cgi-bin/targetscan/data_download.cgi?db=vert_61
## 14 http://amp.pharm.mssm.edu/X2K
## 15 http://www.geneontology.org/GO.downloads.annotations.shtml
## 16 http://genesigdb.org/genesigdb/downloadall.jsp
## 17 http://software.broadinstitute.org/gsea/msigdb/index.jsp
## 18 http://biogps.org/downloads/
## 19 http://biogps.org/downloads/
## 20 http://www.geneontology.org/GO.downloads.annotations.shtml
## 21 http://www.geneontology.org/GO.downloads.annotations.shtml
## 22 http://www.human-phenotype-ontology.org/
## 23 http://www.roadmapepigenomics.org/
## 24 http://amp.pharm.mssm.edu/lib/keacommandline.jsp
## 25 https://www.nursa.org/nursa/index.jsf
## 26 http://mips.helmholtz-muenchen.de/genre/proj/corum/
## 27 http://amp.pharm.mssm.edu/lib/keacommandline.jsp
## 28 http://www.informatics.jax.org/
## 29 http://www.informatics.jax.org/
## 30 http://www.broadinstitute.org/cmap/
## 31 http://www.broadinstitute.org/cmap/
## 32 http://www.omim.org/downloads
## 33 http://www.omim.org/downloads
## 34 http://mint.bio.uniroma2.it/download.html
## 35 http://www.broadinstitute.org/gsea/msigdb/collections.jsp
## 36 http://www.broadinstitute.org/gsea/msigdb/collections.jsp
## 37 http://www.ncbi.nlm.nih.gov/geo/
## 38 http://www.ncbi.nlm.nih.gov/geo/
## 39 http://www.ncbi.nlm.nih.gov/geo/
## 40 https://portals.broadinstitute.org/ccle/home\n
## 41 http://biogps.org/downloads/
## 42 https://www.proteomicsdb.org/
## 43 http://www.humanproteomemap.org/index.php
## 44 http://www.hmdb.ca/downloads
## 45 ftp://ftp.ebi.ac.uk/pub/databases/interpro/
## 46 http://www.geneontology.org/GO.downloads.annotations.shtml
## 47 http://www.geneontology.org/GO.downloads.annotations.shtml
## 48 http://www.geneontology.org/GO.downloads.annotations.shtml
## 49 http://www.brain-map.org/
## 50 http://genome.ucsc.edu/ENCODE/downloads.html
## 51 http://genome.ucsc.edu/ENCODE/downloads.html
## 52 http://www.koehn.embl.de/depod/
## 53 http://www.brain-map.org/
## 54 http://genome.ucsc.edu/ENCODE/downloads.html
## 55 http://www.broadinstitute.org/achilles
## 56 http://www.broadinstitute.org/achilles
## 57 http://www.informatics.jax.org/
## 58 https://cgap.nci.nih.gov/Pathways/BioCarta_Pathways
## 59 http://humancyc.org/
## 60 http://www.kegg.jp/kegg/download/
## 61 http://pid.nci.nih.gov/
## 62 http://www.pantherdb.org/
## 63 http://www.wikipathways.org/index.php/Download_Pathways
## 64 http://www.reactome.org/download/index.html
## 65 http://www.maayanlab.net/ESCAPE/
## 66 http://www.ncbi.nlm.nih.gov/homologene
## 67 http://www.ncbi.nlm.nih.gov/geo/
## 68 http://www.ncbi.nlm.nih.gov/geo/
## 69 http://www.ncbi.nlm.nih.gov/geo/
## 70 https://grants.nih.gov/grants/oer.htm\n
## 71 http://www.ncbi.nlm.nih.gov/geo/
## 72 http://amp.pharm.mssm.edu/Enrichr
## 73 http://www.ncbi.nlm.nih.gov/geo/
## 74 http://www.ncbi.nlm.nih.gov/geo/
## 75 http://amp.pharm.mssm.edu/Enrichr
## 76 http://www.ncbi.nlm.nih.gov/gap
## 77 https://clue.io/
## 78 https://clue.io/
## 79 http://www.gtexportal.org/
## 80 http://www.gtexportal.org/
## 81 http://www.ncbi.nlm.nih.gov/geo/
## 82 http://www.ncbi.nlm.nih.gov/geo/
## 83 http://www.ncbi.nlm.nih.gov/geo/
## 84 http://www.ncbi.nlm.nih.gov/geo/
## 85 http://www.ncbi.nlm.nih.gov/geo/
## 86 http://www.ncbi.nlm.nih.gov/geo/
## 87 http://www.ncbi.nlm.nih.gov/geo/
## 88 http://www.ncbi.nlm.nih.gov/geo/
## 89 https://clue.io/
## 90 https://clue.io/
## 91 https://clue.io/
## 92 https://clue.io/
## 93 http://www.reactome.org/download/index.html
## 94 http://www.kegg.jp/kegg/download/
## 95 http://www.wikipathways.org/index.php/Download_Pathways
## 96
## 97 http://www.ncbi.nlm.nih.gov/geo/
## 98 http://www.ncbi.nlm.nih.gov/geo/
## 99 http://cgap.nci.nih.gov/Pathways/BioCarta_Pathways
## 100 http://humancyc.org/
## 101 http://pid.nci.nih.gov/
## 102 http://www.pantherdb.org/pathway/
## 103 https://ntp.niehs.nih.gov/drugmatrix/
## 104 http://amp.pharm.mssm.edu/Enrichr
## 105 http://proteincomplexes.org/
## 106 http://tissues.jensenlab.org/
## 107 http://www.ncbi.nlm.nih.gov/geo/
## 108 http://www.informatics.jax.org/
## 109 http://compartments.jensenlab.org/
## 110 http://diseases.jensenlab.org/
## 111 http://bioplex.hms.harvard.edu/
## 112 http://www.geneontology.org/
## 113 http://www.geneontology.org/
## 114 http://www.geneontology.org/
## 115 http://www.geneontology.org/
## 116 http://www.geneontology.org/
## 117 http://www.geneontology.org/
## 118 http://amp.pharm.mssm.edu/archs4
## 119 http://amp.pharm.mssm.edu/archs4
## 120 http://amp.pharm.mssm.edu/archs4
## 121 http://amp.pharm.mssm.edu/archs4
## 122 http://amp.pharm.mssm.edu/archs4
## 123 http://sys-myo.rhcloud.com/
## 124 http://mirtarbase.mbc.nctu.edu.tw/
## 125 http://www.targetscan.org/
## 126 http://amp.pharm.mssm.edu/Enrichr
## 127 http://amp.pharm.mssm.edu/Enrichr
## 128 http://amp.pharm.mssm.edu/Enrichr
## 129 http://tanlab.ucdenver.edu/DSigDB/DSigDBv1.0/
## 130 http://www.geneontology.org/
## 131 http://www.geneontology.org/
## 132 http://www.geneontology.org/
## 133 http://www.ncbi.nlm.nih.gov/geo/
#select pathway databases of choice from above
dbs_select<-c("GO_Biological_Process_2018", "KEGG_2016", "Reactome_2016")
SelectGeneListA1=read.csv('geneListsModuleSummarycolors.csv',header=T, sep=',')
SelectGeneListA1=SelectGeneListA1$Gene[modulesA1=='black']
#compare for enrichment with our Genes
enriched <- enrichr(SelectGeneListA1, dbs_select)
## Uploading data to Enrichr... Done.
## Querying GO_Biological_Process_2018... Done.
## Querying KEGG_2016... Done.
## Querying Reactome_2016... Done.
## Parsing results... Done.
#GO Biological Process results
SelectGeneListA1_GOBiologicalProcess<-as.data.frame(enriched[["GO_Biological_Process_2018"]])
write.csv(SelectGeneListA1_GOBiologicalProcess,file="SelectGeneListA1_GOBiologicalProcess_black.csv")
GOBiologicalProcessData<-SelectGeneListA1_GOBiologicalProcess[order(-SelectGeneListA1_GOBiologicalProcess$Combined.Score),]
GOBiologicalProcessData<-GOBiologicalProcessData[1:10,]
ggplot(data=GOBiologicalProcessData, aes(x=Term, y=-log10(Adjusted.P.value), fill=Combined.Score))+
geom_bar(position="stack", stat="identity")+
labs(title="GOBiologicalProcess Comparison")+
#theme(axis.text.x=element_text(angle=90,hjust=1)) +
coord_flip()+
labs(y="-log10(Adjusted.P.value)")+
scale_fill_gradient(low="grey",high="black")
dev.print(pdf,"SelectGeneListA1_GOBiologicalProcessTop10plot_black.pdf", width=15, height=5)
## quartz_off_screen
## 2
#KEEG pathway results
SelectGeneListA1_KEEGpathway<-as.data.frame(enriched[["KEGG_2016"]])
write.csv(SelectGeneListA1_KEEGpathway,file="SelectGeneListA1_KEEGpathway_black.csv")
KEEGpathwayData<-SelectGeneListA1_KEEGpathway[order(-SelectGeneListA1_KEEGpathway$Combined.Score),]
KEEGpathwayData<-KEEGpathwayData[1:10,]
ggplot(data=KEEGpathwayData, aes(x=Term, y=-log10(Adjusted.P.value), fill=Combined.Score))+
geom_bar(position="stack", stat="identity")+
labs(title="KEEGpathway Comparison")+
#theme(axis.text.x=element_text(angle=90,hjust=1)) +
coord_flip()+
labs(y="-log10(Adjusted.P.value)")+
scale_fill_gradient(low="grey",high="black")
dev.print(pdf,"SelectGeneListA1_KEEGpathwayTop10plot_black.pdf", width=10, height=5)
## quartz_off_screen
## 2
#Reactome pathway results
SelectGeneListA1_Reactome<-as.data.frame(enriched[["Reactome_2016"]])
write.csv(SelectGeneListA1_Reactome,"SelectGeneListA1_Reactome_black.csv")
ReactomeData<-SelectGeneListA1_Reactome[order(-SelectGeneListA1_Reactome$Combined.Score),]
ReactomeData<-ReactomeData[1:10,]
ggplot(data=ReactomeData, aes(x=Term, y=-log10(Adjusted.P.value), fill=Combined.Score))+
geom_bar(position="stack", stat="identity")+
labs(title="Reactome Comparison")+
#theme(axis.text.x=element_text(angle=90,hjust=1)) +
coord_flip()+
labs(y="-log10(Adjusted.P.value)")+
scale_fill_gradient(low="grey",high="black")
dev.print(pdf,"SelectGeneListA1_ReactomeTop10plot_black.pdf", width=10, height=5)
## quartz_off_screen
## 2
hub genes from black module which is aging and disease associated ahd has microglia genes
#Microglia genes in black module are stored in a variable for plotting. This can be replace with any choice of genes present in a given module.
black_Microglia_hubgenes=c("TBXAS1","PTPRC","TYROBP","ITGB2","MYO1F","CYBA","LST1","SLC7A7","RNASET2","FCER1G")
summary(black_Microglia_hubgenes)
## Length Class Mode
## 10 character character
#view metadata and expression data A1
DT::datatable(metadataA1g_coded)
DT::datatable(datExprA1g2)